diff --git a/.github/workflows/build_bundle_package.yml b/.github/workflows/build_bundle_package.yml index 950f07ce21a83..cf70eeb1ac1aa 100644 --- a/.github/workflows/build_bundle_package.yml +++ b/.github/workflows/build_bundle_package.yml @@ -52,7 +52,6 @@ jobs: source /opt/rh/devtoolset-9/enable && \ source /opt/gluten//dev/vcpkg/env.sh && \ cd $GITHUB_WORKSPACE/ && \ - sed -i '/^headers/d' ep/build-velox/build/velox_ep/CMakeLists.txt && \ export NUM_THREADS=4 ./dev/builddeps-veloxbe.sh --build_tests=OFF --build_benchmarks=OFF --enable_s3=OFF \ --enable_gcs=OFF --enable_hdfs=ON --enable_abfs=OFF diff --git a/.github/workflows/clickhouse_be_trigger.yml b/.github/workflows/clickhouse_be_trigger.yml index f19a328adcb34..19f9b55a03031 100644 --- a/.github/workflows/clickhouse_be_trigger.yml +++ b/.github/workflows/clickhouse_be_trigger.yml @@ -34,17 +34,21 @@ on: jobs: add-comment: runs-on: ubuntu-latest + permissions: write-all steps: - name: Checkout code uses: actions/checkout@v4 - name: Sleep for Dev PR workflow done run: | sleep 15 - - name: Add comment to PR - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - run: | - COMMENT="Run Gluten Clickhouse CI" - URL=$(jq -r .pull_request.comments_url "$GITHUB_EVENT_PATH") - curl -H "Authorization: token ${GITHUB_TOKEN}" -X POST -d "{\"body\":\"$COMMENT\"}" "${URL}" + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + await github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: context.payload.number, + body: "Run Gluten Clickhouse CI" + }); diff --git a/.github/workflows/dev_cron.yml b/.github/workflows/dev_cron.yml index 48ca21510fd98..193549cc077d9 100644 --- a/.github/workflows/dev_cron.yml +++ b/.github/workflows/dev_cron.yml @@ -27,15 +27,16 @@ jobs: process: name: Process runs-on: ubuntu-latest + permissions: write-all steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Comment Issues link if: | github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@v3 + uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -47,7 +48,7 @@ jobs: github.event_name == 'pull_request_target' && (github.event.action == 'opened' || github.event.action == 'edited') - uses: actions/github-script@v3 + uses: actions/github-script@v7 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | diff --git a/.github/workflows/dev_cron/issues_link.js b/.github/workflows/dev_cron/issues_link.js index 596bad758532c..e47ecb50a55ab 100644 --- a/.github/workflows/dev_cron/issues_link.js +++ b/.github/workflows/dev_cron/issues_link.js @@ -35,7 +35,7 @@ async function haveComment(github, context, pullRequestNumber, body) { page: 1 }; while (true) { - const response = await github.issues.listComments(options); + const response = await github.rest.issues.listComments(options); if (response.data.some(comment => comment.body === body)) { return true; } @@ -52,7 +52,7 @@ async function commentISSUESURL(github, context, pullRequestNumber, issuesID) { if (await haveComment(github, context, pullRequestNumber, issuesURL)) { return; } - await github.issues.createComment({ + await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: pullRequestNumber, diff --git a/.github/workflows/dev_cron/title_check.js b/.github/workflows/dev_cron/title_check.js index e553e20b025e8..1e6df340f2f2b 100644 --- 
a/.github/workflows/dev_cron/title_check.js +++ b/.github/workflows/dev_cron/title_check.js @@ -25,7 +25,7 @@ function haveISSUESID(title) { } async function commentOpenISSUESIssue(github, context, pullRequestNumber) { - const {data: comments} = await github.issues.listComments({ + const {data: comments} = await github.rest.issues.listComments({ owner: context.repo.owner, repo: context.repo.repo, issue_number: pullRequestNumber, @@ -36,7 +36,7 @@ async function commentOpenISSUESIssue(github, context, pullRequestNumber) { } const commentPath = ".github/workflows/dev_cron/title_check.md"; const comment = fs.readFileSync(commentPath).toString(); - await github.issues.createComment({ + await github.rest.issues.createComment({ owner: context.repo.owner, repo: context.repo.repo, issue_number: pullRequestNumber, diff --git a/.github/workflows/docker_image.yml b/.github/workflows/docker_image.yml new file mode 100644 index 0000000000000..6a5697e3aae40 --- /dev/null +++ b/.github/workflows/docker_image.yml @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Build and Push Docker Image + +on: + push: + branches: + - main + paths: + - '.github/workflows/docker_image.yml' + schedule: + - cron: '0 20 * * 0' + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v2 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USER }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Build and push Docker image + uses: docker/build-push-action@v2 + with: + context: . + file: dev/vcpkg/docker/Dockerfile.gha + push: true + tags: apache/gluten:vcpkg-centos-7 diff --git a/.github/workflows/util/install_spark_resources.sh b/.github/workflows/util/install_spark_resources.sh new file mode 100755 index 0000000000000..e1645b170dd57 --- /dev/null +++ b/.github/workflows/util/install_spark_resources.sh @@ -0,0 +1,93 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# Download Spark resources, required by some Spark UTs. The resource path should be set +# for spark.test.home in mvn test. + +set -e + +INSTALL_DIR=$GITHUB_WORKSPACE +case "$1" in +3.2) + # Spark-3.2 + cd ${INSTALL_DIR} && \ + wget -nv https://archive.apache.org/dist/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz && \ + tar --strip-components=1 -xf spark-3.2.2-bin-hadoop3.2.tgz spark-3.2.2-bin-hadoop3.2/jars/ && \ + rm -rf spark-3.2.2-bin-hadoop3.2.tgz && \ + mkdir -p ${INSTALL_DIR}/shims/spark32/spark_home/assembly/target/scala-2.12 && \ + mv jars ${INSTALL_DIR}/shims/spark32/spark_home/assembly/target/scala-2.12 && \ + wget -nv https://github.com/apache/spark/archive/refs/tags/v3.2.2.tar.gz && \ + tar --strip-components=1 -xf v3.2.2.tar.gz spark-3.2.2/sql/core/src/test/resources/ && \ + mkdir -p shims/spark32/spark_home/ && \ + mv sql shims/spark32/spark_home/ + ;; +3.3) + # Spark-3.3 + cd ${INSTALL_DIR} && \ + wget -nv https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz && \ + tar --strip-components=1 -xf spark-3.3.1-bin-hadoop3.tgz spark-3.3.1-bin-hadoop3/jars/ && \ + rm -rf spark-3.3.1-bin-hadoop3.tgz && \ + mkdir -p ${INSTALL_DIR}/shims/spark33/spark_home/assembly/target/scala-2.12 && \ + mv jars ${INSTALL_DIR}/shims/spark33/spark_home/assembly/target/scala-2.12 && \ + wget -nv https://github.com/apache/spark/archive/refs/tags/v3.3.1.tar.gz && \ + tar --strip-components=1 -xf v3.3.1.tar.gz spark-3.3.1/sql/core/src/test/resources/ && \ + mkdir -p shims/spark33/spark_home/ && \ + mv sql shims/spark33/spark_home/ + ;; +3.4) + # Spark-3.4 + cd ${INSTALL_DIR} && \ + wget -nv https://archive.apache.org/dist/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz && \ + tar --strip-components=1 -xf spark-3.4.2-bin-hadoop3.tgz spark-3.4.2-bin-hadoop3/jars/ && \ + rm -rf spark-3.4.2-bin-hadoop3.tgz && \ + mkdir -p ${INSTALL_DIR}/shims/spark34/spark_home/assembly/target/scala-2.12 && \ + mv jars ${INSTALL_DIR}/shims/spark34/spark_home/assembly/target/scala-2.12 && \ + wget -nv https://github.com/apache/spark/archive/refs/tags/v3.4.2.tar.gz && \ + tar --strip-components=1 -xf v3.4.2.tar.gz spark-3.4.2/sql/core/src/test/resources/ && \ + mkdir -p shims/spark34/spark_home/ && \ + mv sql shims/spark34/spark_home/ + ;; +3.5) + # Spark-3.5 + cd ${INSTALL_DIR} && \ + wget -nv https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz && \ + tar --strip-components=1 -xf spark-3.5.1-bin-hadoop3.tgz spark-3.5.1-bin-hadoop3/jars/ && \ + rm -rf spark-3.5.1-bin-hadoop3.tgz && \ + mkdir -p ${INSTALL_DIR}/shims/spark35/spark_home/assembly/target/scala-2.12 && \ + mv jars ${INSTALL_DIR}/shims/spark35/spark_home/assembly/target/scala-2.12 && \ + wget -nv https://github.com/apache/spark/archive/refs/tags/v3.5.1.tar.gz && \ + tar --strip-components=1 -xf v3.5.1.tar.gz spark-3.5.1/sql/core/src/test/resources/ && \ + mkdir -p shims/spark35/spark_home/ && \ + mv sql shims/spark35/spark_home/ + ;; +3.5-scala2.13) + # Spark-3.5, scala 2.13 + cd ${INSTALL_DIR} && \ + wget -nv https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz && \ + tar --strip-components=1 -xf spark-3.5.1-bin-hadoop3.tgz spark-3.5.1-bin-hadoop3/jars/ && \ + rm -rf spark-3.5.1-bin-hadoop3.tgz && \ + mkdir -p ${INSTALL_DIR}/shims/spark35/spark_home/assembly/target/scala-2.13 && \ + mv jars ${INSTALL_DIR}/shims/spark35/spark_home/assembly/target/scala-2.13 && \ + wget -nv 
https://github.com/apache/spark/archive/refs/tags/v3.5.1.tar.gz && \ + tar --strip-components=1 -xf v3.5.1.tar.gz spark-3.5.1/sql/core/src/test/resources/ && \ + mkdir -p shims/spark35/spark_home/ && \ + mv sql shims/spark35/spark_home/ + ;; +*) + echo "Spark version is expected to be specified." + exit 1 + ;; +esac diff --git a/.github/workflows/util/setup_helper.sh b/.github/workflows/util/setup_helper.sh new file mode 100644 index 0000000000000..8b41d83264165 --- /dev/null +++ b/.github/workflows/util/setup_helper.sh @@ -0,0 +1,32 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +function install_maven { + ( + cd /opt/ + wget -nv https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz + tar -xvf apache-maven-3.8.8-bin.tar.gz && mv apache-maven-3.8.8 /usr/lib/maven + ) + echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV +} + +for cmd in "$@" +do + echo "Running: $cmd" + "$cmd" +done diff --git a/.github/workflows/velox_docker.yml b/.github/workflows/velox_backend.yml similarity index 69% rename from .github/workflows/velox_docker.yml rename to .github/workflows/velox_backend.yml index f3066b7baae2d..fc375c666a4f2 100644 --- a/.github/workflows/velox_docker.yml +++ b/.github/workflows/velox_backend.yml @@ -18,7 +18,7 @@ name: Velox backend Github Runner on: pull_request: paths: - - '.github/workflows/velox_docker.yml' + - '.github/workflows/velox_backend.yml' - 'pom.xml' - 'backends-velox/**' - 'gluten-uniffle/**' @@ -43,6 +43,8 @@ on: env: ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true MVN_CMD: 'mvn -ntp' + WGET_CMD: 'wget -nv' + SETUP: 'bash .github/workflows/util/setup_helper.sh' concurrency: group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }} @@ -51,35 +53,33 @@ concurrency: jobs: build-native-lib-centos-7: runs-on: ubuntu-20.04 - container: apache/gluten:gluten-vcpkg-builder_2024_08_05 # centos7 with dependencies installed + container: apache/gluten:vcpkg-centos-7 steps: - uses: actions/checkout@v2 - name: Generate cache key run: | - echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './github/workflows/*') }} > cache-key + echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './.github/workflows/*') }} > cache-key - name: Cache id: cache uses: actions/cache/restore@v3 with: path: | ./cpp/build/releases/ - /root/.m2/repository/org/apache/arrow/ key: cache-velox-build-centos-7-${{ hashFiles('./cache-key') }} - name: Build Gluten native libraries if: ${{ steps.cache.outputs.cache-hit != 'true' }} run: | df -a + cd $GITHUB_WORKSPACE/ bash dev/ci-velox-buildstatic-centos-7.sh - - name: Upload Artifact Native - uses: actions/upload-artifact@v2 + - uses: actions/upload-artifact@v2 with: - path: ./cpp/build/releases/ name: 
velox-native-lib-centos-7-${{github.sha}} - - name: Upload Artifact Arrow Jar - uses: actions/upload-artifact@v2 + path: ./cpp/build/releases/ + - uses: actions/upload-artifact@v2 with: + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: velox-arrow-jar-centos-7-${{github.sha}} run-tpc-test-ubuntu: needs: build-native-lib-centos-7 @@ -119,7 +119,7 @@ jobs: - name: Download All Arrow Jar Artifacts uses: actions/download-artifact@v2 with: - name: velox-arrow-jar-centos-7-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Setup tzdata run: | @@ -190,7 +190,7 @@ jobs: - name: Download All Arrow Jar Artifacts uses: actions/download-artifact@v2 with: - name: velox-arrow-jar-centos-7-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -207,12 +207,9 @@ jobs: else yum update -y && yum install -y java-1.8.0-openjdk-devel wget fi - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven + $SETUP install_maven - name: Set environment variables run: | - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV if [ "${{ matrix.java }}" = "java-17" ]; then echo "JAVA_HOME=/usr/lib/jvm/java-17-openjdk" >> $GITHUB_ENV elif [ "${{ matrix.java }}" = "java-11" ]; then @@ -275,7 +272,7 @@ jobs: - name: Download All Arrow Jar Artifacts uses: actions/download-artifact@v2 with: - name: velox-arrow-jar-centos-7-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /home/runner/.m2/repository/org/apache/arrow/ - name: Setup java and maven run: | @@ -336,7 +333,7 @@ jobs: -d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ -d=FLUSH_MODE:ABANDONED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ -d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 - - name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q23A/Q23B low memory, memory isolation on # Disabled as error https://gist.github.com/zhztheplayer/abd5e83ccdc48730678ae7ebae479fcc + - name: TPC-DS SF30.0 Parquet local spark3.2 Q23A/Q23B low memory, memory isolation on run: | cd tools/gluten-it \ && GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \ @@ -346,7 +343,7 @@ jobs: -d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \ 
-d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ -d=FLUSH_MODE:ABANDONED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \ - -d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 || true + -d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 - name: TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory run: | cd tools/gluten-it \ @@ -387,7 +384,7 @@ jobs: - name: Download All Arrow Jar Artifacts uses: actions/download-artifact@v2 with: - name: velox-arrow-jar-centos-7-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /home/runner/.m2/repository/org/apache/arrow/ - name: Setup java and maven run: | @@ -410,58 +407,6 @@ jobs: --local --preset=velox --benchmark-type=ds --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \ --data-gen=skip --random-kill-tasks --no-session-reuse - # run-tpc-test-ubuntu-sf30: - # needs: build-native-lib-centos-7 - # strategy: - # fail-fast: false - # matrix: - # spark: [ "spark-3.4" ] - # shard: [ "1/4", "2/4", "3/4", "4/4" ] - # runs-on: ubuntu-20.04 - # steps: - # - name: Maximize build disk space - # shell: bash - # run: | - # df -h - # set -euo pipefail - # echo "Removing unwanted software... 
" - # sudo rm -rf /usr/share/dotnet - # sudo rm -rf /usr/local/lib/android - # sudo rm -rf /opt/ghc - # sudo rm -rf /opt/hostedtoolcache/CodeQL - # sudo docker image prune --all --force > /dev/null - # df -h - # - uses: actions/checkout@v2 - # - name: Download All Artifacts - # uses: actions/download-artifact@v2 - # with: - # name: velox-native-lib-centos-7-${{github.sha}} - # path: ./cpp/build/releases - # - name: Setup java and maven - # run: | - # sudo apt-get update - # sudo apt-get install -y openjdk-8-jdk maven - # - name: Set environment variables - # run: | - # echo "JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64" >> $GITHUB_ENV - # - name: Build for Spark ${{ matrix.spark }} - # run: | - # cd $GITHUB_WORKSPACE/ - # $MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -DskipTests - # cd $GITHUB_WORKSPACE/tools/gluten-it - # $MVN_CMD clean install -P${{ matrix.spark }} - # GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh data-gen-only --local --benchmark-type=h -s=30.0 --threads=12 - # GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh data-gen-only --local --benchmark-type=ds -s=30.0 --threads=12 - # - name: TPC-H / TPC-DS SF30.0 Parquet local ${{ matrix.spark }} - # run: | - # cd tools/gluten-it \ - # && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \ - # --local --preset=velox --benchmark-type=h --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \ - # --data-gen=skip --shard=${{ matrix.shard }} \ - # && GLUTEN_IT_JVM_ARGS=-Xmx6G sbin/gluten-it.sh queries-compare \ - # --local --preset=velox --benchmark-type=ds --error-on-memleak -s=30.0 --off-heap-size=8g --threads=12 --shuffle-partitions=72 --iterations=1 \ - # --data-gen=skip --shard=${{ matrix.shard }} - run-tpc-test-centos8-uniffle: needs: build-native-lib-centos-7 strategy: @@ -480,7 +425,7 @@ jobs: - name: Download All Arrow Jar Artifacts uses: actions/download-artifact@v2 with: - name: velox-arrow-jar-centos-7-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -489,22 +434,18 @@ jobs: - name: Setup java and maven run: | yum update -y && yum install -y java-1.8.0-openjdk-devel wget git - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven + $SETUP install_maven - name: Build for Uniffle 0.9.0 run: | - export MAVEN_HOME=/usr/lib/maven && \ - export PATH=${PATH}:${MAVEN_HOME}/bin && \ export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk && \ cd /opt && \ git clone -b v0.9.0 https://github.com/apache/incubator-uniffle.git && \ cd incubator-uniffle && \ $MVN_CMD clean install -Phadoop2.8,spark3 -DskipTests cd /opt && \ - wget -nv https://archive.apache.org/dist/incubator/uniffle/0.9.0/apache-uniffle-0.9.0-incubating-bin.tar.gz && \ + ${WGET_CMD} https://archive.apache.org/dist/incubator/uniffle/0.9.0/apache-uniffle-0.9.0-incubating-bin.tar.gz && \ tar xzf apache-uniffle-0.9.0-incubating-bin.tar.gz -C /opt/ && mv /opt/rss-0.9.0-hadoop2.8 /opt/uniffle && \ - wget -nv https://archive.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz && \ + ${WGET_CMD} https://archive.apache.org/dist/hadoop/common/hadoop-2.8.5/hadoop-2.8.5.tar.gz && \ tar xzf hadoop-2.8.5.tar.gz -C /opt/ rm -rf /opt/incubator-uniffle cd /opt/uniffle && mkdir shuffle_data && \ @@ -514,14 +455,10 @@ jobs: bash ./bin/start-coordinator.sh && bash ./bin/start-shuffle-server.sh - name: Build for 
Spark ${{ matrix.spark }} run: | - export MAVEN_HOME=/usr/lib/maven && \ - export PATH=${PATH}:${MAVEN_HOME}/bin && \ cd $GITHUB_WORKSPACE/ && \ $MVN_CMD clean install -P${{ matrix.spark }} -Pbackends-velox -Puniffle -DskipTests - name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2 with uniffle-0.9.0 run: | - export MAVEN_HOME=/usr/lib/maven && \ - export PATH=${PATH}:${MAVEN_HOME}/bin && \ export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk && \ cd $GITHUB_WORKSPACE/tools/gluten-it && \ $MVN_CMD clean install -Pspark-3.2 -Puniffle && \ @@ -547,7 +484,7 @@ jobs: - name: Download All Arrow Jar Artifacts uses: actions/download-artifact@v2 with: - name: velox-arrow-jar-centos-7-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Setup tzdata run: | @@ -572,7 +509,7 @@ jobs: fi echo "EXTRA_PROFILE: ${EXTRA_PROFILE}" cd /opt && mkdir -p celeborn && \ - wget https://archive.apache.org/dist/celeborn/${{ matrix.celeborn }}/apache-${{ matrix.celeborn }}-bin.tgz && \ + ${WGET_CMD} https://archive.apache.org/dist/celeborn/${{ matrix.celeborn }}/apache-${{ matrix.celeborn }}-bin.tgz && \ tar xzf apache-${{ matrix.celeborn }}-bin.tgz -C /opt/celeborn --strip-components=1 && cd celeborn && \ mv ./conf/celeborn-env.sh.template ./conf/celeborn-env.sh && \ bash -c "echo -e 'CELEBORN_MASTER_MEMORY=4g\nCELEBORN_WORKER_MEMORY=4g\nCELEBORN_WORKER_OFFHEAP_MEMORY=8g' > ./conf/celeborn-env.sh" && \ @@ -584,73 +521,21 @@ jobs: GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \ --local --preset=velox-with-celeborn --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=8 --iterations=1 - build-native-lib-centos-8: - runs-on: ubuntu-20.04 - container: ghcr.io/facebookincubator/velox-dev:centos8 - steps: - - uses: actions/checkout@v2 - - name: Generate cache key - run: | - echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './github/workflows/*') }} > cache-key - - name: Cache - id: cache - uses: actions/cache/restore@v3 - with: - path: | - ./cpp/build/releases/ - ./cpp/build/velox/udf/examples/ - ./cpp/build/velox/benchmarks/ - /root/.m2/repository/org/apache/arrow/ - key: cache-velox-build-centos-8-${{ hashFiles('./cache-key') }} - - name: Build Gluten native libraries - if: steps.cache.outputs.cache-hit != 'true' - run: | - df -a - bash dev/ci-velox-buildshared-centos-8.sh - - uses: actions/upload-artifact@v2 - with: - name: velox-native-lib-centos-8-${{github.sha}} - path: ./cpp/build/releases/ - - uses: actions/upload-artifact@v2 - with: - name: udf-example-lib-centos-8-${{github.sha}} - path: ./cpp/build/velox/udf/examples/ - - uses: actions/upload-artifact@v2 - with: - name: benchmark-centos-8-${{github.sha}} - path: ./cpp/build/velox/benchmarks/ - - uses: actions/upload-artifact@v2 - with: - name: arrow-jars-centos-8-${{github.sha}} - path: /root/.m2/repository/org/apache/arrow/ - run-spark-test-spark32: - needs: build-native-lib-centos-8 + needs: build-native-lib-centos-7 runs-on: ubuntu-20.04 container: ghcr.io/facebookincubator/velox-dev:centos8 env: CCACHE_DIR: "${{ github.workspace }}/.ccache" steps: - uses: actions/checkout@v2 - - name: Download All Artifacts - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v2 with: - name: velox-native-lib-centos-8-${{github.sha}} + name: velox-native-lib-centos-7-${{github.sha}} path: ./cpp/build/releases - - name: Download UDF Example Lib - uses: actions/download-artifact@v2 + - uses: actions/download-artifact@v2 with: - name: 
udf-example-lib-centos-8-${{github.sha}} - path: ./cpp/build/velox/udf/examples/ - - name: Download Benchmark - uses: actions/download-artifact@v2 - with: - name: benchmark-centos-8-${{github.sha}} - path: ./cpp/build/velox/benchmarks/ - - name: Download Arrow Jars - uses: actions/download-artifact@v2 - with: - name: arrow-jars-centos-8-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -659,10 +544,7 @@ jobs: - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV + $SETUP install_maven - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -674,17 +556,7 @@ jobs: mkdir -p '${{ env.CCACHE_DIR }}' - name: Prepare spark.test.home for Spark 3.2.2 (other tests) run: | - cd $GITHUB_WORKSPACE/ && \ - wget https://archive.apache.org/dist/spark/spark-3.2.2/spark-3.2.2-bin-hadoop3.2.tgz && \ - tar --strip-components=1 -xf spark-3.2.2-bin-hadoop3.2.tgz spark-3.2.2-bin-hadoop3.2/jars/ && \ - rm -rf spark-3.2.2-bin-hadoop3.2.tgz && \ - mkdir -p $GITHUB_WORKSPACE//shims/spark32/spark_home/assembly/target/scala-2.12 && \ - mv jars $GITHUB_WORKSPACE//shims/spark32/spark_home/assembly/target/scala-2.12 && \ - cd $GITHUB_WORKSPACE// && \ - wget https://github.com/apache/spark/archive/refs/tags/v3.2.2.tar.gz && \ - tar --strip-components=1 -xf v3.2.2.tar.gz spark-3.2.2/sql/core/src/test/resources/ && \ - mkdir -p shims/spark32/spark_home/ && \ - mv sql shims/spark32/spark_home/ && \ + bash .github/workflows/util/install_spark_resources.sh 3.2 dnf module -y install python39 && \ alternatives --set python3 /usr/bin/python3.9 && \ pip3 install setuptools && \ @@ -694,23 +566,18 @@ jobs: run: | cd $GITHUB_WORKSPACE/ export SPARK_SCALA_VERSION=2.12 - $MVN_CMD clean install -Pspark-3.2 -Pspark-ut -Pbackends-velox -Pceleborn -Piceberg -Pdelta -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark32/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ - $MVN_CMD test -Pspark-3.2 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest + $MVN_CMD clean test -Pspark-3.2 -Pspark-ut -Pbackends-velox -Pceleborn -Piceberg \ + -Pdelta -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark32/spark_home/" \ + -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags - name: Upload golden files if: failure() uses: actions/upload-artifact@v4 with: name: golden-files-spark32 path: /tmp/tpch-approved-plan/** - - name: Gluten CPP Benchmark Test - run: | - # This test depends on example.json generated by the above mvn test. 
- cd $GITHUB_WORKSPACE/cpp/build/velox/benchmarks && \ - sudo chmod +x ./generic_benchmark && \ - ./generic_benchmark --run-example --with-shuffle --threads 1 --iterations 1 run-spark-test-spark32-slow: - needs: build-native-lib-centos-8 + needs: build-native-lib-centos-7 runs-on: ubuntu-20.04 container: ghcr.io/facebookincubator/velox-dev:centos8 env: @@ -720,12 +587,12 @@ jobs: - name: Download All Artifacts uses: actions/download-artifact@v2 with: - name: velox-native-lib-centos-8-${{github.sha}} + name: velox-native-lib-centos-7-${{github.sha}} path: ./cpp/build/releases - name: Download Arrow Jars uses: actions/download-artifact@v2 with: - name: arrow-jars-centos-8-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -734,10 +601,7 @@ jobs: - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV + $SETUP install_maven - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -749,18 +613,15 @@ jobs: mkdir -p '${{ env.CCACHE_DIR }}' - name: Prepare spark.test.home for Spark 3.2.2 (slow tests) run: | - cd $GITHUB_WORKSPACE// && \ - wget https://github.com/apache/spark/archive/refs/tags/v3.2.2.tar.gz && \ - tar --strip-components=1 -xf v3.2.2.tar.gz spark-3.2.2/sql/core/src/test/resources/ && \ - mkdir -p shims/spark32/spark_home/ && \ - mv sql shims/spark32/spark_home/ + bash .github/workflows/util/install_spark_resources.sh 3.2 - name: Build and run unit test for Spark 3.2.2 (slow tests) run: | cd $GITHUB_WORKSPACE/ - $MVN_CMD clean install -Pspark-3.2 -Pspark-ut -Pbackends-velox -Pceleborn -Piceberg -Pdelta -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark32/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest + $MVN_CMD clean test -Pspark-3.2 -Pspark-ut -Pbackends-velox -Pceleborn -Piceberg -Pdelta \ + -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark32/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest run-spark-test-spark33: - needs: build-native-lib-centos-8 + needs: build-native-lib-centos-7 runs-on: ubuntu-20.04 container: ghcr.io/facebookincubator/velox-dev:centos8 env: @@ -770,17 +631,12 @@ jobs: - name: Download All Artifacts uses: actions/download-artifact@v2 with: - name: velox-native-lib-centos-8-${{github.sha}} + name: velox-native-lib-centos-7-${{github.sha}} path: ./cpp/build/releases - - name: Download UDF Example Lib - uses: actions/download-artifact@v2 - with: - name: udf-example-lib-centos-8-${{github.sha}} - path: ./cpp/build/velox/udf/examples/ - name: Download Arrow Jars uses: actions/download-artifact@v2 with: - name: arrow-jars-centos-8-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -789,10 +645,7 @@ jobs: - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV + $SETUP install_maven - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -804,17 +657,7 @@ jobs: mkdir -p '${{ env.CCACHE_DIR }}' - 
name: Prepare spark.test.home for Spark 3.3.1 (other tests) run: | - cd $GITHUB_WORKSPACE/ && \ - wget https://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz && \ - tar --strip-components=1 -xf spark-3.3.1-bin-hadoop3.tgz spark-3.3.1-bin-hadoop3/jars/ && \ - rm -rf spark-3.3.1-bin-hadoop3.tgz && \ - mkdir -p $GITHUB_WORKSPACE//shims/spark33/spark_home/assembly/target/scala-2.12 && \ - mv jars $GITHUB_WORKSPACE//shims/spark33/spark_home/assembly/target/scala-2.12 && \ - cd $GITHUB_WORKSPACE// && \ - wget https://github.com/apache/spark/archive/refs/tags/v3.3.1.tar.gz && \ - tar --strip-components=1 -xf v3.3.1.tar.gz spark-3.3.1/sql/core/src/test/resources/ && \ - mkdir -p shims/spark33/spark_home/ && \ - mv sql shims/spark33/spark_home/ && \ + bash .github/workflows/util/install_spark_resources.sh 3.3 dnf module -y install python39 && \ alternatives --set python3 /usr/bin/python3.9 && \ pip3 install setuptools && \ @@ -824,8 +667,9 @@ jobs: run: | cd $GITHUB_WORKSPACE/ export SPARK_SCALA_VERSION=2.12 - $MVN_CMD clean install -Pspark-3.3 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark33/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ - $MVN_CMD test -Pspark-3.3 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest + $MVN_CMD clean test -Pspark-3.3 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut \ + -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark33/spark_home/" \ + -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags - name: Upload golden files if: failure() uses: actions/upload-artifact@v4 @@ -835,7 +679,7 @@ jobs: run-spark-test-spark33-slow: - needs: build-native-lib-centos-8 + needs: build-native-lib-centos-7 runs-on: ubuntu-20.04 container: ghcr.io/facebookincubator/velox-dev:centos8 env: @@ -845,12 +689,12 @@ jobs: - name: Download All Artifacts uses: actions/download-artifact@v2 with: - name: velox-native-lib-centos-8-${{github.sha}} + name: velox-native-lib-centos-7-${{github.sha}} path: ./cpp/build/releases - name: Download Arrow Jars uses: actions/download-artifact@v2 with: - name: arrow-jars-centos-8-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -859,10 +703,7 @@ jobs: - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV + $SETUP install_maven - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -874,18 +715,16 @@ jobs: mkdir -p '${{ env.CCACHE_DIR }}' - name: Prepare spark.test.home for Spark 3.3.1 (slow tests) run: | - cd $GITHUB_WORKSPACE// && \ - wget https://github.com/apache/spark/archive/refs/tags/v3.3.1.tar.gz && \ - tar --strip-components=1 -xf v3.3.1.tar.gz spark-3.3.1/sql/core/src/test/resources/ && \ - mkdir -p shims/spark33/spark_home/ && \ - mv sql shims/spark33/spark_home/ + bash .github/workflows/util/install_spark_resources.sh 3.3 - name: Build and Run unit test for Spark 3.3.1 (slow tests) run: | cd $GITHUB_WORKSPACE/ - $MVN_CMD clean install -Pspark-3.3 -Pbackends-velox -Pceleborn 
-Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark33/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest + $MVN_CMD clean test -Pspark-3.3 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut \ + -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark33/spark_home/" \ + -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest run-spark-test-spark34: - needs: build-native-lib-centos-8 + needs: build-native-lib-centos-7 runs-on: ubuntu-20.04 container: ghcr.io/facebookincubator/velox-dev:centos8 env: @@ -895,17 +734,12 @@ jobs: - name: Download All Artifacts uses: actions/download-artifact@v2 with: - name: velox-native-lib-centos-8-${{github.sha}} + name: velox-native-lib-centos-7-${{github.sha}} path: ./cpp/build/releases - - name: Download UDF Example Lib - uses: actions/download-artifact@v2 - with: - name: udf-example-lib-centos-8-${{github.sha}} - path: ./cpp/build/velox/udf/examples/ - name: Download Arrow Jars uses: actions/download-artifact@v2 with: - name: arrow-jars-centos-8-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -914,10 +748,7 @@ jobs: - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV + $SETUP install_maven - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -929,17 +760,7 @@ jobs: mkdir -p '${{ env.CCACHE_DIR }}' - name: Prepare spark.test.home for Spark 3.4.2 (other tests) run: | - cd $GITHUB_WORKSPACE/ && \ - wget https://archive.apache.org/dist/spark/spark-3.4.2/spark-3.4.2-bin-hadoop3.tgz && \ - tar --strip-components=1 -xf spark-3.4.2-bin-hadoop3.tgz spark-3.4.2-bin-hadoop3/jars/ && \ - rm -rf spark-3.4.2-bin-hadoop3.tgz && \ - mkdir -p $GITHUB_WORKSPACE//shims/spark34/spark_home/assembly/target/scala-2.12 && \ - mv jars $GITHUB_WORKSPACE//shims/spark34/spark_home/assembly/target/scala-2.12 && \ - cd $GITHUB_WORKSPACE// && \ - wget https://github.com/apache/spark/archive/refs/tags/v3.4.2.tar.gz && \ - tar --strip-components=1 -xf v3.4.2.tar.gz spark-3.4.2/sql/core/src/test/resources/ && \ - mkdir -p shims/spark34/spark_home/ && \ - mv sql shims/spark34/spark_home/ && \ + bash .github/workflows/util/install_spark_resources.sh 3.4 dnf module -y install python39 && \ alternatives --set python3 /usr/bin/python3.9 && \ pip3 install setuptools && \ @@ -949,8 +770,9 @@ jobs: run: | cd $GITHUB_WORKSPACE/ export SPARK_SCALA_VERSION=2.12 - $MVN_CMD clean install -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark34/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ - $MVN_CMD test -Pspark-3.4 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest + $MVN_CMD clean test -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut \ + -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark34/spark_home/" \ + -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags - name: Upload golden files if: failure() uses: actions/upload-artifact@v4 @@ -960,7 +782,7 @@ jobs: 
run-spark-test-spark34-slow: - needs: build-native-lib-centos-8 + needs: build-native-lib-centos-7 runs-on: ubuntu-20.04 container: ghcr.io/facebookincubator/velox-dev:centos8 env: @@ -970,12 +792,12 @@ jobs: - name: Download All Artifacts uses: actions/download-artifact@v2 with: - name: velox-native-lib-centos-8-${{github.sha}} + name: velox-native-lib-centos-7-${{github.sha}} path: ./cpp/build/releases - name: Download Arrow Jars uses: actions/download-artifact@v2 with: - name: arrow-jars-centos-8-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -984,10 +806,7 @@ jobs: - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV + $SETUP install_maven - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -999,18 +818,16 @@ jobs: mkdir -p '${{ env.CCACHE_DIR }}' - name: Prepare spark.test.home for Spark 3.4.2 (slow tests) run: | - cd $GITHUB_WORKSPACE// && \ - wget https://github.com/apache/spark/archive/refs/tags/v3.4.2.tar.gz && \ - tar --strip-components=1 -xf v3.4.2.tar.gz spark-3.4.2/sql/core/src/test/resources/ && \ - mkdir -p shims/spark34/spark_home/ && \ - mv sql shims/spark34/spark_home/ + bash .github/workflows/util/install_spark_resources.sh 3.4 - name: Build and Run unit test for Spark 3.4.2 (slow tests) run: | cd $GITHUB_WORKSPACE/ - $MVN_CMD clean install -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark34/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest + $MVN_CMD clean test -Pspark-3.4 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut \ + -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark34/spark_home/" \ + -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest run-spark-test-spark35: - needs: build-native-lib-centos-8 + needs: build-native-lib-centos-7 runs-on: ubuntu-20.04 container: ghcr.io/facebookincubator/velox-dev:centos8 env: @@ -1020,17 +837,12 @@ jobs: - name: Download All Artifacts uses: actions/download-artifact@v2 with: - name: velox-native-lib-centos-8-${{github.sha}} + name: velox-native-lib-centos-7-${{github.sha}} path: ./cpp/build/releases - - name: Download UDF Example Lib - uses: actions/download-artifact@v2 - with: - name: udf-example-lib-centos-8-${{github.sha}} - path: ./cpp/build/velox/udf/examples/ - name: Download Arrow Jars uses: actions/download-artifact@v2 with: - name: arrow-jars-centos-8-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -1039,10 +851,7 @@ jobs: - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV + $SETUP install_maven - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -1054,17 +863,7 @@ jobs: mkdir -p '${{ env.CCACHE_DIR }}' - name: Prepare spark.test.home for Spark 3.5.1 (other tests) run: | - cd $GITHUB_WORKSPACE/ && \ - wget 
https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz && \ - tar --strip-components=1 -xf spark-3.5.1-bin-hadoop3.tgz spark-3.5.1-bin-hadoop3/jars/ && \ - rm -rf spark-3.5.1-bin-hadoop3.tgz && \ - mkdir -p $GITHUB_WORKSPACE//shims/spark35/spark_home/assembly/target/scala-2.12 && \ - mv jars $GITHUB_WORKSPACE//shims/spark35/spark_home/assembly/target/scala-2.12 && \ - cd $GITHUB_WORKSPACE// && \ - wget https://github.com/apache/spark/archive/refs/tags/v3.5.1.tar.gz && \ - tar --strip-components=1 -xf v3.5.1.tar.gz spark-3.5.1/sql/core/src/test/resources/ && \ - mkdir -p shims/spark35/spark_home/ && \ - mv sql shims/spark35/spark_home/ && \ + bash .github/workflows/util/install_spark_resources.sh 3.5 dnf module -y install python39 && \ alternatives --set python3 /usr/bin/python3.9 && \ pip3 install setuptools && \ @@ -1074,8 +873,9 @@ jobs: run: | cd $GITHUB_WORKSPACE/ export SPARK_SCALA_VERSION=2.12 - $MVN_CMD clean install -Pspark-3.5 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags && \ - $MVN_CMD test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None -DtagsToInclude=org.apache.gluten.tags.UDFTest + $MVN_CMD clean test -Pspark-3.5 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut \ + -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" \ + -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags - name: Upload golden files if: failure() uses: actions/upload-artifact@v4 @@ -1083,8 +883,59 @@ jobs: name: golden-files-spark35 path: /tmp/tpch-approved-plan/** + run-spark-test-spark35-scala213: + needs: build-native-lib-centos-7 + runs-on: ubuntu-20.04 + container: ghcr.io/facebookincubator/velox-dev:centos8 + env: + CCACHE_DIR: "${{ github.workspace }}/.ccache" + steps: + - uses: actions/checkout@v2 + - name: Download All Artifacts + uses: actions/download-artifact@v2 + with: + name: velox-native-lib-centos-7-${{github.sha}} + path: ./cpp/build/releases + - name: Download Arrow Jars + uses: actions/download-artifact@v2 + with: + name: arrow-jars-centos-7-${{github.sha}} + path: /root/.m2/repository/org/apache/arrow/ + - name: Update mirror list + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true + - name: Setup build dependency + run: | + yum install sudo patch java-1.8.0-openjdk-devel wget -y + $SETUP install_maven + - name: Get Ccache + uses: actions/cache/restore@v3 + with: + path: '${{ env.CCACHE_DIR }}' + key: ccache-centos-release-default + - name: Ensure Cache Dirs Exists + working-directory: ${{ github.workspace }} + run: | + mkdir -p '${{ env.CCACHE_DIR }}' + - name: Prepare spark.test.home for Spark 3.5.1 (other tests) + run: | + bash .github/workflows/util/install_spark_resources.sh 3.5-scala2.13 + dnf module -y install python39 && \ + alternatives --set python3 /usr/bin/python3.9 && \ + pip3 install setuptools && \ + pip3 install pyspark==3.5.1 cython && \ + pip3 install pandas pyarrow + - name: Build and Run unit test for Spark 3.5.1 with scala-2.13 (other tests) + run: | + cd $GITHUB_WORKSPACE/ + export SPARK_SCALA_VERSION=2.13 + $MVN_CMD clean test -Pspark-3.5 -Pscala-2.13 -Pbackends-velox 
-Pceleborn -Piceberg \ + -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" \ + -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.SkipTestTags + run-spark-test-spark35-slow: - needs: build-native-lib-centos-8 + needs: build-native-lib-centos-7 runs-on: ubuntu-20.04 container: ghcr.io/facebookincubator/velox-dev:centos8 env: @@ -1094,12 +945,12 @@ jobs: - name: Download All Artifacts uses: actions/download-artifact@v2 with: - name: velox-native-lib-centos-8-${{github.sha}} + name: velox-native-lib-centos-7-${{github.sha}} path: ./cpp/build/releases - name: Download Arrow Jars uses: actions/download-artifact@v2 with: - name: arrow-jars-centos-8-${{github.sha}} + name: arrow-jars-centos-7-${{github.sha}} path: /root/.m2/repository/org/apache/arrow/ - name: Update mirror list run: | @@ -1108,10 +959,7 @@ jobs: - name: Setup build dependency run: | yum install sudo patch java-1.8.0-openjdk-devel wget -y - wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz - tar -xvf apache-maven-3.8.8-bin.tar.gz - mv apache-maven-3.8.8 /usr/lib/maven - echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV + $SETUP install_maven - name: Get Ccache uses: actions/cache/restore@v3 with: @@ -1123,18 +971,55 @@ jobs: mkdir -p '${{ env.CCACHE_DIR }}' - name: Prepare spark.test.home for Spark 3.5.1 (other tests) run: | - cd $GITHUB_WORKSPACE/ && \ - wget https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz && \ - tar --strip-components=1 -xf spark-3.5.1-bin-hadoop3.tgz spark-3.5.1-bin-hadoop3/jars/ && \ - rm -rf spark-3.5.1-bin-hadoop3.tgz && \ - mkdir -p $GITHUB_WORKSPACE//shims/spark35/spark_home/assembly/target/scala-2.12 && \ - mv jars $GITHUB_WORKSPACE//shims/spark35/spark_home/assembly/target/scala-2.12 && \ - cd $GITHUB_WORKSPACE// && \ - wget https://github.com/apache/spark/archive/refs/tags/v3.5.1.tar.gz && \ - tar --strip-components=1 -xf v3.5.1.tar.gz spark-3.5.1/sql/core/src/test/resources/ && \ - mkdir -p shims/spark35/spark_home/ && \ - mv sql shims/spark35/spark_home/ + bash .github/workflows/util/install_spark_resources.sh 3.5 - name: Build and Run unit test for Spark 3.5.1 (slow tests) run: | cd $GITHUB_WORKSPACE/ - $MVN_CMD clean install -Pspark-3.5 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest + $MVN_CMD clean test -Pspark-3.5 -Pbackends-velox -Pceleborn -Piceberg -Pdelta -Pspark-ut \ + -DargLine="-Dspark.test.home=$GITHUB_WORKSPACE//shims/spark35/spark_home/" \ + -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest + + run-cpp-test-udf-test: + runs-on: ubuntu-20.04 + container: ghcr.io/facebookincubator/velox-dev:centos8 + steps: + - uses: actions/checkout@v2 + - name: Generate cache key + run: | + echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './.github/workflows/*') }} > cache-key + - name: Cache + id: cache + uses: actions/cache/restore@v3 + with: + path: | + ./cpp/build/releases/ + ./cpp/build/velox/udf/examples/ + ./cpp/build/velox/benchmarks/ + /root/.m2/repository/org/apache/arrow/ + key: cache-velox-build-centos-8-${{ hashFiles('./cache-key') }} + - name: Setup java and maven + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" 
/etc/yum.repos.d/CentOS-* || true + yum install sudo patch java-1.8.0-openjdk-devel wget -y + $SETUP install_maven + - name: Build Gluten native libraries + if: steps.cache.outputs.cache-hit != 'true' + run: | + df -a + bash dev/ci-velox-buildshared-centos-8.sh + - name: Run CPP unit test + run: | + cd ./cpp/build && ctest -V + - name: Run CPP benchmark test + run: | + $MVN_CMD test -Pspark-3.5 -Pbackends-velox -pl backends-velox -am \ + -DtagsToInclude="org.apache.gluten.tags.GenerateExample" -Dtest=none -DfailIfNoTests=false -Dexec.skip + # This test depends on example.json generated by the above mvn test. + cd cpp/build/velox/benchmarks && sudo chmod +x ./generic_benchmark + ./generic_benchmark --run-example --with-shuffle --threads 1 --iterations 1 + - name: Run UDF test + run: | + # Depends on --build_example=ON. + $MVN_CMD test -Pspark-3.5 -Pbackends-velox -Piceberg -Pdelta -DtagsToExclude=None \ + -DtagsToInclude=org.apache.gluten.tags.UDFTest diff --git a/.github/workflows/velox_docker_cache.yml b/.github/workflows/velox_backend_cache.yml similarity index 90% rename from .github/workflows/velox_docker_cache.yml rename to .github/workflows/velox_backend_cache.yml index 3fc0fc50f8473..a25eda9367d62 100644 --- a/.github/workflows/velox_docker_cache.yml +++ b/.github/workflows/velox_backend_cache.yml @@ -30,12 +30,12 @@ concurrency: jobs: cache-native-lib-centos-7: runs-on: ubuntu-20.04 - container: apache/gluten:gluten-vcpkg-builder_2024_08_05 # centos7 with dependencies installed + container: apache/gluten:vcpkg-centos-7 steps: - uses: actions/checkout@v2 - name: Generate cache key run: | - echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './github/workflows/*') }} > cache-key + echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './.github/workflows/*') }} > cache-key - name: Check existing caches id: check-cache uses: actions/cache/restore@v3 @@ -43,7 +43,6 @@ jobs: lookup-only: true path: | ./cpp/build/releases/ - /root/.m2/repository/org/apache/arrow/ key: cache-velox-build-centos-7-${{ hashFiles('./cache-key') }} - name: Build Gluten native libraries if: steps.check-cache.outputs.cache-hit != 'true' @@ -57,7 +56,6 @@ jobs: with: path: | ./cpp/build/releases/ - /root/.m2/repository/org/apache/arrow/ key: cache-velox-build-centos-7-${{ hashFiles('./cache-key') }} cache-native-lib-centos-8: @@ -67,7 +65,7 @@ jobs: - uses: actions/checkout@v2 - name: Generate cache key run: | - echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './github/workflows/*') }} > cache-key + echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './.github/workflows/*') }} > cache-key - name: Check existing caches id: check-cache uses: actions/cache/restore@v3 @@ -79,6 +77,13 @@ jobs: ./cpp/build/velox/benchmarks/ /root/.m2/repository/org/apache/arrow/ key: cache-velox-build-centos-8-${{ hashFiles('./cache-key') }} + - name: Setup java and maven + if: steps.check-cache.outputs.cache-hit != 'true' + run: | + sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true + sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true + yum install sudo patch java-1.8.0-openjdk-devel wget -y + bash .github/workflows/util/setup_helper.sh install_maven - name: Build Gluten native libraries if: steps.check-cache.outputs.cache-hit != 'true' run: | diff --git a/.scalafmt.conf b/.scalafmt.conf index e65c0217fc584..937ab11383e31 100644 --- a/.scalafmt.conf +++ b/.scalafmt.conf @@ 
-1,7 +1,7 @@ runner.dialect = scala212 # Version is required to make sure IntelliJ picks the right version -version = 3.5.9 +version = 3.8.3 preset = default # Max column diff --git a/backends-clickhouse/src/main/antlr4/org/apache/gluten/sql/parser/GlutenCacheFileSqlBase.g4 b/backends-clickhouse/src/main/antlr4/org/apache/gluten/sql/parser/GlutenCacheFileSqlBase.g4 new file mode 100644 index 0000000000000..abdb0cdbf81ef --- /dev/null +++ b/backends-clickhouse/src/main/antlr4/org/apache/gluten/sql/parser/GlutenCacheFileSqlBase.g4 @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +grammar GlutenCacheFileSqlBase; + +@members { + /** + * Verify whether current token is a valid decimal token (which contains dot). + * Returns true if the character that follows the token is not a digit or letter or underscore. + * + * For example: + * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'. + * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'. + * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'. + * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is folllowed + * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+' + * which is not a digit or letter or underscore. + */ + public boolean isValidDecimal() { + int nextChar = _input.LA(1); + if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' || + nextChar == '_') { + return false; + } else { + return true; + } + } +} + +tokens { + DELIMITER +} + +singleStatement + : statement ';'* EOF + ; + +statement + : CACHE FILES ASYNC? SELECT selectedColumns=selectedColumnNames + FROM (path=STRING) + (CACHEPROPERTIES cacheProps=propertyList)? #cacheFiles + | .*? #passThrough + ; + +selectedColumnNames + : ASTERISK + | identifier (COMMA identifier)* + ; + +propertyList + : LEFT_PAREN property (COMMA property)* RIGHT_PAREN + ; + +property + : key=propertyKey (EQ? value=propertyValue)? 
+ ; + +propertyKey + : identifier (DOT identifier)* + | stringLit + ; + +propertyValue + : INTEGER_VALUE + | DECIMAL_VALUE + | booleanValue + | identifier LEFT_PAREN stringLit COMMA stringLit RIGHT_PAREN + | value=stringLit + ; + +stringLit + : STRING + | DOUBLEQUOTED_STRING + ; + +booleanValue + : TRUE | FALSE + ; + +identifier + : IDENTIFIER #unquotedIdentifier + | quotedIdentifier #quotedIdentifierAlternative + | nonReserved #unquotedIdentifier + ; + +quotedIdentifier + : BACKQUOTED_IDENTIFIER + ; + +// Add keywords here so that people's queries don't break if they have a column name as one of +// these tokens +nonReserved + : CACHE | FILES | ASYNC + | SELECT | FOR | AFTER | CACHEPROPERTIES + | TIMESTAMP | AS | OF | DATE_PARTITION + | + ; + +// Define how the keywords above should appear in a user's SQL statement. +CACHE: 'CACHE'; +META: 'META'; +ASYNC: 'ASYNC'; +SELECT: 'SELECT'; +COMMA: ','; +FOR: 'FOR'; +FROM: 'FROM'; +AFTER: 'AFTER'; +CACHEPROPERTIES: 'CACHEPROPERTIES'; +DOT: '.'; +ASTERISK: '*'; +TIMESTAMP: 'TIMESTAMP'; +AS: 'AS'; +OF: 'OF'; +DATE_PARTITION: 'DATE_PARTITION'; +LEFT_PAREN: '('; +RIGHT_PAREN: ')'; +TRUE: 'TRUE'; +FALSE: 'FALSE'; +FILES: 'FILES'; + +EQ : '=' | '=='; +NSEQ: '<=>'; +NEQ : '<>'; +NEQJ: '!='; +LTE : '<=' | '!>'; +GTE : '>=' | '!<'; +CONCAT_PIPE: '||'; + +STRING + : '\'' ( ~('\''|'\\') | ('\\' .) )* '\'' + | '"' ( ~('"'|'\\') | ('\\' .) )* '"' + ; + +DOUBLEQUOTED_STRING + :'"' ( ~('"'|'\\') | ('\\' .) )* '"' + ; + +BIGINT_LITERAL + : DIGIT+ 'L' + ; + +SMALLINT_LITERAL + : DIGIT+ 'S' + ; + +TINYINT_LITERAL + : DIGIT+ 'Y' + ; + +INTEGER_VALUE + : DIGIT+ + ; + +DECIMAL_VALUE + : DIGIT+ EXPONENT + | DECIMAL_DIGITS EXPONENT? {isValidDecimal()}? + ; + +DOUBLE_LITERAL + : DIGIT+ EXPONENT? 'D' + | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}? + ; + +BIGDECIMAL_LITERAL + : DIGIT+ EXPONENT? 'BD' + | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}? + ; + +IDENTIFIER + : (LETTER | DIGIT | '_')+ + ; + +BACKQUOTED_IDENTIFIER + : '`' ( ~'`' | '``' )* '`' + ; + +fragment DECIMAL_DIGITS + : DIGIT+ '.' DIGIT* + | '.' DIGIT+ + ; + +fragment EXPONENT + : 'E' [+-]? DIGIT+ + ; + +fragment DIGIT + : [0-9] + ; + +fragment LETTER + : [A-Z] + ; + +SIMPLE_COMMENT + : '--' ~[\r\n]* '\r'? '\n'? -> channel(HIDDEN) + ; + +BRACKETED_COMMENT + : '/*' .*? '*/' -> channel(HIDDEN) + ; + +WS : [ \r\n\t]+ -> channel(HIDDEN) + ; + +// Catch-all for anything we can't recognize. +// We use this to be able to ignore and recover all the text +// when splitting statements with DelimiterLexer +UNRECOGNIZED + : . + ; diff --git a/backends-clickhouse/src/main/delta-20/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala b/backends-clickhouse/src/main/delta-20/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala new file mode 100644 index 0000000000000..48d26498f1214 --- /dev/null +++ b/backends-clickhouse/src/main/delta-20/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
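[Editor's note, not part of the patch] For orientation, a sketch of the statement shape the grammar above is written to accept. The path and the CACHEPROPERTIES key/value below are made-up placeholders, and such statements only parse once the GlutenCacheFilesSqlParser added later in this patch has been injected into the session; `spark` is assumed to be an active SparkSession.

// Illustrative only: hypothetical HDFS path and cache property.
val cacheStmt =
  """CACHE FILES ASYNC
    |SELECT *
    |FROM 'hdfs://ns1/warehouse/db.db/tbl'
    |CACHEPROPERTIES ('some_property' = 'some_value')""".stripMargin

spark.sql(cacheStmt)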
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.parser + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.types.{DataType, StructType} + +class GlutenCacheFilesSqlParser(spark: SparkSession, delegate: ParserInterface) + extends GlutenCacheFileSqlParserBase { + + override def parsePlan(sqlText: String): LogicalPlan = + parse(sqlText) { + parser => + astBuilder.visit(parser.singleStatement()) match { + case plan: LogicalPlan => plan + case _ => delegate.parsePlan(sqlText) + } + } + + override def parseExpression(sqlText: String): Expression = { + delegate.parseExpression(sqlText) + } + + override def parseTableIdentifier(sqlText: String): TableIdentifier = { + delegate.parseTableIdentifier(sqlText) + } + + override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { + delegate.parseFunctionIdentifier(sqlText) + } + + override def parseMultipartIdentifier(sqlText: String): Seq[String] = { + delegate.parseMultipartIdentifier(sqlText) + } + + override def parseTableSchema(sqlText: String): StructType = { + delegate.parseTableSchema(sqlText) + } + + override def parseDataType(sqlText: String): DataType = { + delegate.parseDataType(sqlText) + } +} diff --git a/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala new file mode 100644 index 0000000000000..e949bebf236a6 --- /dev/null +++ b/backends-clickhouse/src/main/delta-20/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
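[Editor's note, not part of the patch] Under the same assumptions, a sketch of the delegation contract of the parser above at the ParserInterface level: only text matched by the Gluten grammar is planned here, while the .*? #passThrough alternative and every non-plan entry point fall through to the wrapped Spark parser.

import org.apache.gluten.parser.GlutenCacheFilesSqlParser

val delegate = spark.sessionState.sqlParser
val parser = new GlutenCacheFilesSqlParser(spark, delegate)

// Matched by the #cacheFiles rule, so the Gluten AST builder produces the plan
// (the path is again a hypothetical placeholder).
val cachePlan = parser.parsePlan("CACHE FILES SELECT * FROM 'hdfs://ns1/db/tbl'")

// Not matched by the grammar, so it is handed to the delegate unchanged.
val selectPlan = parser.parsePlan("SELECT 1")

// Non-plan entry points are always delegated.
val expr = parser.parseExpression("a + 1")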
+ */ +package org.apache.spark.sql.execution + +import org.apache.gluten.exception.GlutenNotSupportException + +import org.apache.spark.internal.io.FileCommitProtocol +import org.apache.spark.sql.execution.datasources.WriteJobDescription + +object CHDeltaColumnarWrite { + def apply( + jobTrackerID: String, + description: WriteJobDescription, + committer: FileCommitProtocol): CHColumnarWrite[FileCommitProtocol] = + throw new GlutenNotSupportException("Delta Native is not supported in Spark 3.2") +} diff --git a/backends-clickhouse/src/main/delta-23/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala b/backends-clickhouse/src/main/delta-23/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala new file mode 100644 index 0000000000000..9a0cde7728434 --- /dev/null +++ b/backends-clickhouse/src/main/delta-23/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.parser + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.types.{DataType, StructType} + +class GlutenCacheFilesSqlParser(spark: SparkSession, delegate: ParserInterface) + extends GlutenCacheFileSqlParserBase { + + override def parsePlan(sqlText: String): LogicalPlan = + parse(sqlText) { + parser => + astBuilder.visit(parser.singleStatement()) match { + case plan: LogicalPlan => plan + case _ => delegate.parsePlan(sqlText) + } + } + + override def parseExpression(sqlText: String): Expression = { + delegate.parseExpression(sqlText) + } + + override def parseTableIdentifier(sqlText: String): TableIdentifier = { + delegate.parseTableIdentifier(sqlText) + } + + override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { + delegate.parseFunctionIdentifier(sqlText) + } + + override def parseMultipartIdentifier(sqlText: String): Seq[String] = { + delegate.parseMultipartIdentifier(sqlText) + } + + override def parseTableSchema(sqlText: String): StructType = { + delegate.parseTableSchema(sqlText) + } + + override def parseDataType(sqlText: String): DataType = { + delegate.parseDataType(sqlText) + } + + override def parseQuery(sqlText: String): LogicalPlan = { + delegate.parseQuery(sqlText) + } +} diff --git a/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala new file mode 100644 index 0000000000000..0a1aee5c4bfb7 --- /dev/null +++ 
b/backends-clickhouse/src/main/delta-23/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import org.apache.gluten.exception.GlutenNotSupportException + +import org.apache.spark.internal.io.FileCommitProtocol +import org.apache.spark.sql.execution.datasources.WriteJobDescription + +object CHDeltaColumnarWrite { + def apply( + jobTrackerID: String, + description: WriteJobDescription, + committer: FileCommitProtocol): CHColumnarWrite[FileCommitProtocol] = + throw new GlutenNotSupportException("Delta Native is not supported in Spark 3.3") +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala b/backends-clickhouse/src/main/delta-32/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala new file mode 100644 index 0000000000000..9a0cde7728434 --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/gluten/parser/GlutenCacheFilesSqlParser.scala @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.parser + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.types.{DataType, StructType} + +class GlutenCacheFilesSqlParser(spark: SparkSession, delegate: ParserInterface) + extends GlutenCacheFileSqlParserBase { + + override def parsePlan(sqlText: String): LogicalPlan = + parse(sqlText) { + parser => + astBuilder.visit(parser.singleStatement()) match { + case plan: LogicalPlan => plan + case _ => delegate.parsePlan(sqlText) + } + } + + override def parseExpression(sqlText: String): Expression = { + delegate.parseExpression(sqlText) + } + + override def parseTableIdentifier(sqlText: String): TableIdentifier = { + delegate.parseTableIdentifier(sqlText) + } + + override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = { + delegate.parseFunctionIdentifier(sqlText) + } + + override def parseMultipartIdentifier(sqlText: String): Seq[String] = { + delegate.parseMultipartIdentifier(sqlText) + } + + override def parseTableSchema(sqlText: String): StructType = { + delegate.parseTableSchema(sqlText) + } + + override def parseDataType(sqlText: String): DataType = { + delegate.parseDataType(sqlText) + } + + override def parseQuery(sqlText: String): LogicalPlan = { + delegate.parseQuery(sqlText) + } +} diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala index 6eec68efece3b..e023d3d7cbe76 100644 --- a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala @@ -25,17 +25,21 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.delta.actions._ import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 import org.apache.spark.sql.delta.constraints.{Constraint, Constraints} -import org.apache.spark.sql.delta.files.MergeTreeCommitProtocol -import org.apache.spark.sql.delta.schema.InvariantViolationException +import org.apache.spark.sql.delta.files.{DelayedCommitProtocol, DeltaFileFormatWriter, MergeTreeCommitProtocol, TransactionalWrite} +import org.apache.spark.sql.delta.hooks.AutoCompact +import org.apache.spark.sql.delta.schema.{InnerInvariantViolationException, InvariantViolationException} import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} +import org.apache.spark.sql.delta.stats.DeltaJobStatisticsTracker +import org.apache.spark.sql.execution.{CHDelayedCommitProtocol, QueryExecution, SparkPlan, SQLExecution} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec -import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FakeRowAdaptor, FileFormatWriter, WriteJobStatsTracker} +import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FakeRowAdaptor, FileFormatWriter, WriteFiles, WriteJobStatsTracker} import org.apache.spark.sql.execution.datasources.v1.clickhouse.MergeTreeFileFormatWriter import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig +import 
org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.SerializableConfiguration import org.apache.commons.lang3.exception.ExceptionUtils +import org.apache.hadoop.fs.Path import scala.collection.mutable.ListBuffer @@ -190,4 +194,158 @@ class ClickhouseOptimisticTransaction( super.writeFiles(inputData, writeOptions, additionalConstraints) } } + + private def shouldOptimizeWrite( + writeOptions: Option[DeltaOptions], + sessionConf: SQLConf): Boolean = { + writeOptions + .flatMap(_.optimizeWrite) + .getOrElse(TransactionalWrite.shouldOptimizeWrite(metadata, sessionConf)) + } + + override protected def getCommitter(outputPath: Path): DelayedCommitProtocol = + new CHDelayedCommitProtocol("delta", outputPath.toString, None, deltaDataSubdir) + + override def writeFiles( + inputData: Dataset[_], + writeOptions: Option[DeltaOptions], + isOptimize: Boolean, + additionalConstraints: Seq[Constraint]): Seq[FileAction] = { + + if (isOptimize) + throw new UnsupportedOperationException("Optimize is not supported for ClickHouse") + + hasWritten = true + + val spark = inputData.sparkSession + val (data, partitionSchema) = performCDCPartition(inputData) + val outputPath = deltaLog.dataPath + + val fileFormat = deltaLog.fileFormat(protocol, metadata) // TODO support changing formats. + + // Iceberg spec requires partition columns in data files + val writePartitionColumns = IcebergCompat.isAnyEnabled(metadata) + // Retain only a minimal selection of Spark writer options to avoid any potential + // compatibility issues + val options = (writeOptions match { + case None => Map.empty[String, String] + case Some(writeOptions) => + writeOptions.options.filterKeys { + key => + key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) || + key.equalsIgnoreCase(DeltaOptions.COMPRESSION) + }.toMap + }) + (DeltaOptions.WRITE_PARTITION_COLUMNS -> writePartitionColumns.toString) + + val (normalQueryExecution, output, generatedColumnConstraints, _) = + normalizeData(deltaLog, writeOptions, data) + val partitioningColumns = getPartitioningColumns(partitionSchema, output) + + val logicalPlan = normalQueryExecution.optimizedPlan + val write = + WriteFiles(logicalPlan, fileFormat, partitioningColumns, None, options, Map.empty) + + val queryExecution = new QueryExecution(spark, write) + val committer = getCommitter(outputPath) + + // If Statistics Collection is enabled, then create a stats tracker that will be injected during + // the FileFormatWriter.write call below and will collect per-file stats using + // StatisticsCollection + // val (optionalStatsTracker, _) = + // getOptionalStatsTrackerAndStatsCollection(output, outputPath, partitionSchema, data) + val optionalStatsTracker: Option[DeltaJobStatisticsTracker] = None + + val constraints = + Constraints.getAll(metadata, spark) ++ generatedColumnConstraints ++ additionalConstraints + + SQLExecution.withNewExecutionId(queryExecution, Option("deltaTransactionalWrite")) { + val outputSpec = FileFormatWriter.OutputSpec(outputPath.toString, Map.empty, output) + + val physicalPlan = materializeAdaptiveSparkPlan(queryExecution.executedPlan) + // convertEmptyToNullIfNeeded(queryExecution.executedPlan, partitioningColumns, constraints) + /* val checkInvariants = DeltaInvariantCheckerExec(empty2NullPlan, constraints) + // No need to plan optimized write if the write command is OPTIMIZE, which aims to produce + // evenly-balanced data files already. 
+ val physicalPlan = + if ( + !isOptimize && + shouldOptimizeWrite(writeOptions, spark.sessionState.conf) + ) { + DeltaOptimizedWriterExec(checkInvariants, metadata.partitionColumns, deltaLog) + } else { + checkInvariants + }*/ + val statsTrackers: ListBuffer[WriteJobStatsTracker] = ListBuffer() + + if (spark.conf.get(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED)) { + val basicWriteJobStatsTracker = new BasicWriteJobStatsTracker( + new SerializableConfiguration(deltaLog.newDeltaHadoopConf()), + BasicWriteJobStatsTracker.metrics) + registerSQLMetrics(spark, basicWriteJobStatsTracker.driverSideMetrics) + statsTrackers.append(basicWriteJobStatsTracker) + } + + try { + DeltaFileFormatWriter.write( + sparkSession = spark, + plan = physicalPlan, + fileFormat = fileFormat, + committer = committer, + outputSpec = outputSpec, + // scalastyle:off deltahadoopconfiguration + hadoopConf = + spark.sessionState.newHadoopConfWithOptions(metadata.configuration ++ deltaLog.options), + // scalastyle:on deltahadoopconfiguration + partitionColumns = partitioningColumns, + bucketSpec = None, + statsTrackers = optionalStatsTracker.toSeq + ++ statsTrackers, + options = options + ) + } catch { + case InnerInvariantViolationException(violationException) => + // Pull an InvariantViolationException up to the top level if it was the root cause. + throw violationException + } + } + + var resultFiles = + (if (optionalStatsTracker.isDefined) { + committer.addedStatuses.map { + a => + a.copy(stats = + optionalStatsTracker.map(_.recordedStats(a.toPath.getName)).getOrElse(a.stats)) + } + } else { + committer.addedStatuses + }) + .filter { + // In some cases, we can write out an empty `inputData`. Some examples of this (though, they + // may be fixed in the future) are the MERGE command when you delete with empty source, or + // empty target, or on disjoint tables. This is hard to catch before the write without + // collecting the DF ahead of time. Instead, we can return only the AddFiles that + // a) actually add rows, or + // b) don't have any stats so we don't know the number of rows at all + case a: AddFile => a.numLogicalRecords.forall(_ > 0) + case _ => true + } + + // add [[AddFile.Tags.ICEBERG_COMPAT_VERSION.name]] tags to addFiles + if (IcebergCompatV2.isEnabled(metadata)) { + resultFiles = resultFiles.map { + addFile => + val tags = if (addFile.tags != null) addFile.tags else Map.empty[String, String] + addFile.copy(tags = tags + (AddFile.Tags.ICEBERG_COMPAT_VERSION.name -> "2")) + } + } + + if (resultFiles.nonEmpty && !isOptimize) registerPostCommitHook(AutoCompact) + + resultFiles.toSeq ++ committer.changeFiles + } + + private def materializeAdaptiveSparkPlan(plan: SparkPlan): SparkPlan = plan match { + case a: AdaptiveSparkPlanExec => a.finalPhysicalPlan + case p: SparkPlan => p + } } diff --git a/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala new file mode 100644 index 0000000000000..66f502038fcdd --- /dev/null +++ b/backends-clickhouse/src/main/delta-32/org/apache/spark/sql/execution/CHDeltaColumnarWrite.scala @@ -0,0 +1,144 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import org.apache.gluten.backendsapi.BackendsApiManager +import org.apache.gluten.exception.GlutenNotSupportException + +import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.FileCommitProtocol +import org.apache.spark.sql.delta.files.DelayedCommitProtocol +import org.apache.spark.sql.execution.datasources.{ExecutedWriteSummary, WriteJobDescription, WriteTaskResult} +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.Utils + +import org.apache.hadoop.mapreduce.TaskAttemptContext + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +/** A Wrapper of [[DelayedCommitProtocol]] for accessing protected methods and fields. */ +class CHDelayedCommitProtocol( + jobId: String, + val outputPath: String, + randomPrefixLength: Option[Int], + subdir: Option[String]) + extends DelayedCommitProtocol(jobId, outputPath, randomPrefixLength, subdir) { + + override def getFileName( + taskContext: TaskAttemptContext, + ext: String, + partitionValues: Map[String, String]): String = { + super.getFileName(taskContext, ext, partitionValues) + } + + def updateAddedFiles(files: Seq[(Map[String, String], String)]): Unit = { + assert(addedFiles.isEmpty) + addedFiles ++= files + } + + override def parsePartitions(dir: String): Map[String, String] = + super.parsePartitions(dir) +} + +case class CHDelayedCommitProtocolWrite( + override val jobTrackerID: String, + override val description: WriteJobDescription, + override val committer: CHDelayedCommitProtocol) + extends CHColumnarWrite[CHDelayedCommitProtocol] + with Logging { + + override def doSetupNativeTask(): Unit = { + assert(description.path == committer.outputPath) + val nameSpec = CreateFileNameSpec(taskAttemptContext, description) + val writePath = description.path + val writeFileName = committer.getFileName(taskAttemptContext, nameSpec.suffix, Map.empty) + logDebug(s"Native staging write path: $writePath and file name: $writeFileName") + BackendsApiManager.getIteratorApiInstance.injectWriteFilesTempPath(writePath, writeFileName) + } + + private def doCollectNativeResult( + cb: ColumnarBatch): Option[(Seq[(Map[String, String], String)], ExecutedWriteSummary)] = { + val numFiles = cb.numRows() + // Write an empty iterator + if (numFiles == 0) { + None + } else { + val file_col = cb.column(0) + val partition_col = cb.column(1) + val count_col = cb.column(2) + + val partitions: mutable.Set[String] = mutable.Set[String]() + val addedFiles: ArrayBuffer[(Map[String, String], String)] = + new ArrayBuffer[(Map[String, String], String)] + + var numWrittenRows: Long = 0 + Range(0, cb.numRows()).foreach { + i => + val fileName = file_col.getUTF8String(i).toString + val partition = partition_col.getUTF8String(i).toString + if (partition == "__NO_PARTITION_ID__") { + addedFiles.append((Map.empty[String, String], fileName)) + } else { + val partitionValues = 
committer.parsePartitions(partition) + addedFiles.append((partitionValues, s"$partition/$fileName")) + } + numWrittenRows += count_col.getLong(i) + } + val updatedPartitions = partitions.toSet + Some( + ( + addedFiles.toSeq, + ExecutedWriteSummary( + updatedPartitions = updatedPartitions, + stats = Seq(CreateBasicWriteTaskStats(numFiles, updatedPartitions, numWrittenRows))))) + } + } + + override def commitTask(batch: ColumnarBatch): Option[WriteTaskResult] = { + doCollectNativeResult(batch).map { + case (addedFiles, summary) => + require(addedFiles.nonEmpty, "No files to commit") + + committer.updateAddedFiles(addedFiles) + + val (taskCommitMessage, taskCommitTime) = Utils.timeTakenMs { + committer.commitTask(taskAttemptContext) + } + + // Just for update task commit time + description.statsTrackers.foreach { + stats => stats.newTaskInstance().getFinalStats(taskCommitTime) + } + WriteTaskResult(taskCommitMessage, summary) + } + } +} + +object CHDeltaColumnarWrite { + def apply( + jobTrackerID: String, + description: WriteJobDescription, + committer: FileCommitProtocol): CHColumnarWrite[FileCommitProtocol] = committer match { + case c: CHDelayedCommitProtocol => + CHDelayedCommitProtocolWrite(jobTrackerID, description, c) + .asInstanceOf[CHColumnarWrite[FileCommitProtocol]] + case _ => + throw new GlutenNotSupportException( + s"Unsupported committer type: ${committer.getClass.getSimpleName}") + } +} diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/execution/CHNativeCacheManager.java b/backends-clickhouse/src/main/java/org/apache/gluten/execution/CHNativeCacheManager.java index f5f75dc1dca6d..4033d8c6b1ccc 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/execution/CHNativeCacheManager.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/execution/CHNativeCacheManager.java @@ -19,9 +19,20 @@ import java.util.Set; public class CHNativeCacheManager { - public static void cacheParts(String table, Set columns, boolean async) { - nativeCacheParts(table, String.join(",", columns), async); + public static String cacheParts(String table, Set columns) { + return nativeCacheParts(table, String.join(",", columns)); } - private static native void nativeCacheParts(String table, String columns, boolean async); + private static native String nativeCacheParts(String table, String columns); + + public static CacheResult getCacheStatus(String jobId) { + return nativeGetCacheStatus(jobId); + } + + private static native CacheResult nativeGetCacheStatus(String jobId); + + public static native String nativeCacheFiles(byte[] files); + + // only for ut + public static native void removeFiles(String file, String cacheName); } diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/execution/CacheResult.java b/backends-clickhouse/src/main/java/org/apache/gluten/execution/CacheResult.java new file mode 100644 index 0000000000000..b6d538039ec42 --- /dev/null +++ b/backends-clickhouse/src/main/java/org/apache/gluten/execution/CacheResult.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
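[Editor's note, not part of the patch] The CHNativeCacheManager changes above make part caching an asynchronous job: cacheParts now returns a job id instead of taking an async flag, and the CacheResult type introduced just below reports RUNNING, SUCCESS or ERROR plus a message. A minimal polling sketch, assuming the native engine is already initialized in the JVM where this runs and that the table identifier format matches what the native cache manager expects:

import org.apache.gluten.execution.{CacheResult, CHNativeCacheManager}
import scala.collection.JavaConverters._

// Hypothetical table identifier and column set.
val jobId = CHNativeCacheManager.cacheParts("db.tbl_snapshot", Set("c1", "c2").asJava)

var result = CHNativeCacheManager.getCacheStatus(jobId)
while (result.getStatus == CacheResult.Status.RUNNING) {
  Thread.sleep(500)
  result = CHNativeCacheManager.getCacheStatus(jobId)
}
if (result.getStatus == CacheResult.Status.ERROR) {
  sys.error(s"Cache job $jobId failed: ${result.getMessage}")
}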
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution; + +import java.io.Serializable; + +public class CacheResult implements Serializable { + public enum Status { + RUNNING(0), + SUCCESS(1), + ERROR(2); + + private final int value; + + Status(int value) { + this.value = value; + } + + public int getValue() { + return value; + } + + public static Status fromInt(int value) { + for (Status myEnum : Status.values()) { + if (myEnum.getValue() == value) { + return myEnum; + } + } + throw new IllegalArgumentException("No enum constant for value: " + value); + } + } + + private final Status status; + private final String message; + + public CacheResult(int status, String message) { + this.status = Status.fromInt(status); + this.message = message; + } + + public Status getStatus() { + return status; + } + + public String getMessage() { + return message; + } +} diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/metrics/MetricsStep.java b/backends-clickhouse/src/main/java/org/apache/gluten/metrics/MetricsStep.java index f95aaa323c109..39dd949658358 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/metrics/MetricsStep.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/metrics/MetricsStep.java @@ -35,6 +35,24 @@ public class MetricsStep { @JsonProperty("selected_marks") protected long selectedMarks; + @JsonProperty("read_cache_hits") + protected long readCacheHits; + + @JsonProperty("miss_cache_hits") + protected long missCacheHits; + + @JsonProperty("read_cache_bytes") + protected long readCacheBytes; + + @JsonProperty("read_miss_bytes") + protected long readMissBytes; + + @JsonProperty("read_cache_millisecond") + protected long readCacheMillisecond; + + @JsonProperty("miss_cache_millisecond") + protected long missCacheMillisecond; + public String getName() { return name; } @@ -82,4 +100,52 @@ public long getTotalMarksPk() { public long getSelectedMarksPk() { return selectedMarksPk; } + + public long getReadCacheHits() { + return readCacheHits; + } + + public void setReadCacheHits(long readCacheHits) { + this.readCacheHits = readCacheHits; + } + + public long getMissCacheHits() { + return missCacheHits; + } + + public void setMissCacheHits(long missCacheHits) { + this.missCacheHits = missCacheHits; + } + + public long getReadCacheBytes() { + return readCacheBytes; + } + + public void setReadCacheBytes(long readCacheBytes) { + this.readCacheBytes = readCacheBytes; + } + + public long getReadMissBytes() { + return readMissBytes; + } + + public void setReadMissBytes(long readMissBytes) { + this.readMissBytes = readMissBytes; + } + + public long getReadCacheMillisecond() { + return readCacheMillisecond; + } + + public void setReadCacheMillisecond(long readCacheMillisecond) { + this.readCacheMillisecond = readCacheMillisecond; + } + + public long getMissCacheMillisecond() { + return missCacheMillisecond; + } + + public void setMissCacheMillisecond(long missCacheMillisecond) { + this.missCacheMillisecond = missCacheMillisecond; + } } diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/BlockOutputStream.java 
b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/BlockOutputStream.java index 40e2c2c56b77f..e209010b2f856 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/BlockOutputStream.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/BlockOutputStream.java @@ -38,6 +38,7 @@ public BlockOutputStream( SQLMetric dataSize, boolean compressionEnable, String defaultCompressionCodec, + int defaultCompressionLevel, int bufferSize) { OutputStream unwrapOutputStream = CHShuffleWriteStreamFactory.unwrapSparkCompressionOutputStream( @@ -50,7 +51,12 @@ public BlockOutputStream( } this.instance = nativeCreate( - this.outputStream, buffer, defaultCompressionCodec, compressionEnable, bufferSize); + this.outputStream, + buffer, + defaultCompressionCodec, + defaultCompressionLevel, + compressionEnable, + bufferSize); this.dataSize = dataSize; } @@ -58,6 +64,7 @@ private native long nativeCreate( OutputStream outputStream, byte[] buffer, String defaultCompressionCodec, + int defaultCompressionLevel, boolean compressionEnable, int bufferSize); diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHShuffleSplitterJniWrapper.java b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHShuffleSplitterJniWrapper.java index 864cc4eb70ace..7bc4f5dac6b82 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHShuffleSplitterJniWrapper.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/CHShuffleSplitterJniWrapper.java @@ -27,6 +27,7 @@ public long make( long mapId, int bufferSize, String codec, + int level, String dataFile, String localDirs, int subDirsPerLocalDir, @@ -43,6 +44,7 @@ public long make( mapId, bufferSize, codec, + level, dataFile, localDirs, subDirsPerLocalDir, @@ -58,6 +60,7 @@ public long makeForRSS( long mapId, int bufferSize, String codec, + int level, long spillThreshold, String hashAlgorithm, Object pusher, @@ -71,6 +74,7 @@ public long makeForRSS( mapId, bufferSize, codec, + level, spillThreshold, hashAlgorithm, pusher, @@ -86,6 +90,7 @@ public native long nativeMake( long mapId, int bufferSize, String codec, + int level, String dataFile, String localDirs, int subDirsPerLocalDir, @@ -103,6 +108,7 @@ public native long nativeMakeForRSS( long mapId, int bufferSize, String codec, + int level, long spillThreshold, String hashAlgorithm, Object pusher, diff --git a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java index ae7b89120cd4d..1c4c1302d2aef 100644 --- a/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java +++ b/backends-clickhouse/src/main/java/org/apache/gluten/vectorized/StorageJoinBuilder.java @@ -17,6 +17,7 @@ package org.apache.gluten.vectorized; import org.apache.gluten.execution.BroadCastHashJoinContext; +import org.apache.gluten.execution.JoinTypeTransform; import org.apache.gluten.expression.ConverterUtils; import org.apache.gluten.expression.ConverterUtils$; import org.apache.gluten.substrait.type.TypeNode; @@ -80,7 +81,9 @@ public static long build( if (broadCastContext.buildHashTableId().startsWith("BuiltBNLJBroadcastTable-")) { joinType = SubstraitUtil.toCrossRelSubstrait(broadCastContext.joinType()).ordinal(); } else { - joinType = SubstraitUtil.toSubstrait(broadCastContext.joinType()).ordinal(); + boolean buildRight = broadCastContext.buildRight(); + joinType = + 
JoinTypeTransform.toSubstraitJoinType(broadCastContext.joinType(), buildRight).ordinal(); } return nativeBuild( diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBackend.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBackend.scala index 06fe8c34ca4a5..86a69f8422808 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBackend.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHBackend.scala @@ -29,6 +29,7 @@ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression +import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.aggregate.HashAggregateExec @@ -52,6 +53,7 @@ class CHBackend extends Backend { override def validatorApi(): ValidatorApi = new CHValidatorApi override def metricsApi(): MetricsApi = new CHMetricsApi override def listenerApi(): ListenerApi = new CHListenerApi + override def ruleApi(): RuleApi = new CHRuleApi override def settings(): BackendSettingsApi = CHBackendSettings } @@ -140,10 +142,11 @@ object CHBackendSettings extends BackendSettingsApi with Logging { .toLowerCase(Locale.getDefault) } - override def supportFileFormatRead( + override def validateScan( format: ReadFileFormat, fields: Array[StructField], partTable: Boolean, + rootPaths: Seq[String], paths: Seq[String]): ValidationResult = { def validateFilePath: Boolean = { @@ -356,12 +359,50 @@ object CHBackendSettings extends BackendSettingsApi with Logging { .getLong(GLUTEN_MAX_SHUFFLE_READ_BYTES, GLUTEN_MAX_SHUFFLE_READ_BYTES_DEFAULT) } + // Reorder hash join tables, make sure to use the smaller table to build the hash table. + // Need to enable AQE + def enableReorderHashJoinTables(): Boolean = { + SparkEnv.get.conf.getBoolean( + "spark.gluten.sql.columnar.backend.ch.enable_reorder_hash_join_tables", + true + ) + } + // The threshold to reorder hash join tables, if The result of dividing two tables' size is + // large then this threshold, reorder the tables. e.g. 
a/b > threshold or b/a > threshold + def reorderHashJoinTablesThreshold(): Int = { + SparkEnv.get.conf.getInt( + "spark.gluten.sql.columnar.backend.ch.reorder_hash_join_tables_thresdhold", + 10 + ) + } + override def enableNativeWriteFiles(): Boolean = { GlutenConfig.getConf.enableNativeWriter.getOrElse(false) } - override def mergeTwoPhasesHashBaseAggregateIfNeed(): Boolean = true - override def supportCartesianProductExec(): Boolean = true + override def supportHashBuildJoinTypeOnLeft: JoinType => Boolean = { + t => + if (super.supportHashBuildJoinTypeOnLeft(t)) { + true + } else { + t match { + case LeftOuter => true + case _ => false + } + } + } + + override def supportHashBuildJoinTypeOnRight: JoinType => Boolean = { + t => + if (super.supportHashBuildJoinTypeOnRight(t)) { + true + } else { + t match { + case RightOuter => true + case _ => false + } + } + } } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHIteratorApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHIteratorApi.scala index 7519580b9cb74..c77d5726222c4 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHIteratorApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHIteratorApi.scala @@ -22,6 +22,7 @@ import org.apache.gluten.execution._ import org.apache.gluten.expression.ConverterUtils import org.apache.gluten.memory.CHThreadGroup import org.apache.gluten.metrics.{IMetrics, NativeMetrics} +import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.plan.PlanNode import org.apache.gluten.substrait.rel._ import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat @@ -164,6 +165,8 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { val paths = new JArrayList[String]() val starts = new JArrayList[JLong]() val lengths = new JArrayList[JLong]() + val fileSizes = new JArrayList[JLong]() + val modificationTimes = new JArrayList[JLong]() val partitionColumns = new JArrayList[JMap[String, String]] f.files.foreach { file => @@ -173,6 +176,16 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { // TODO: Support custom partition location val partitionColumn = new JHashMap[String, String]() partitionColumns.add(partitionColumn) + val (fileSize, modificationTime) = + SparkShimLoader.getSparkShims.getFileSizeAndModificationTime(file) + (fileSize, modificationTime) match { + case (Some(size), Some(time)) => + fileSizes.add(JLong.valueOf(size)) + modificationTimes.add(JLong.valueOf(time)) + case _ => + fileSizes.add(0) + modificationTimes.add(0) + } } val preferredLocations = CHAffinity.getFilePartitionLocations(paths.asScala.toArray, f.preferredLocations()) @@ -181,8 +194,8 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil { paths, starts, lengths, - new JArrayList[JLong](), - new JArrayList[JLong](), + fileSizes, + modificationTimes, partitionColumns, new JArrayList[JMap[String, String]](), fileFormat, diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHMetricsApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHMetricsApi.scala index 0ff53e1c58178..c53448cdd8586 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHMetricsApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHMetricsApi.scala @@ -128,7 +128,25 @@ class 
CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), "selectedMarksPk" -> SQLMetrics.createMetric(sparkContext, "selected marks primary"), "selectedMarks" -> SQLMetrics.createMetric(sparkContext, "selected marks"), - "totalMarksPk" -> SQLMetrics.createMetric(sparkContext, "total marks primary") + "totalMarksPk" -> SQLMetrics.createMetric(sparkContext, "total marks primary"), + "readCacheHits" -> SQLMetrics.createMetric( + sparkContext, + "Number of times the read from filesystem cache hit the cache"), + "missCacheHits" -> SQLMetrics.createMetric( + sparkContext, + "Number of times the read from filesystem cache miss the cache"), + "readCacheBytes" -> SQLMetrics.createSizeMetric( + sparkContext, + "Bytes read from filesystem cache"), + "readMissBytes" -> SQLMetrics.createSizeMetric( + sparkContext, + "Bytes read from filesystem cache source (from remote fs, etc)"), + "readCacheMillisecond" -> SQLMetrics.createTimingMetric( + sparkContext, + "Time reading from filesystem cache"), + "missCacheMillisecond" -> SQLMetrics.createTimingMetric( + sparkContext, + "Time reading from filesystem cache source (from remote filesystem, etc)") ) override def genFileSourceScanTransformerMetricsUpdater( @@ -298,7 +316,31 @@ class CHMetricsApi extends MetricsApi with Logging with LogLevelUtil { override def genSortMergeJoinTransformerMetrics( sparkContext: SparkContext): Map[String, SQLMetric] = - Map.empty + Map( + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), + "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), + "outputVectors" -> SQLMetrics.createMetric(sparkContext, "number of output vectors"), + "outputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of output bytes"), + "numInputRows" -> SQLMetrics.createMetric(sparkContext, "number of input rows"), + "inputBytes" -> SQLMetrics.createSizeMetric(sparkContext, "number of input bytes"), + "extraTime" -> SQLMetrics.createTimingMetric(sparkContext, "extra operators time"), + "inputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for data"), + "outputWaitTime" -> SQLMetrics.createTimingMetric(sparkContext, "time of waiting for output"), + "streamPreProjectionTime" -> + SQLMetrics.createTimingMetric(sparkContext, "time of stream side preProjection"), + "buildPreProjectionTime" -> + SQLMetrics.createTimingMetric(sparkContext, "time of build side preProjection"), + "postProjectTime" -> + SQLMetrics.createTimingMetric(sparkContext, "time of postProjection"), + "probeTime" -> + SQLMetrics.createTimingMetric(sparkContext, "time of probe"), + "totalTime" -> SQLMetrics.createTimingMetric(sparkContext, "time"), + "fillingRightJoinSideTime" -> SQLMetrics.createTimingMetric( + sparkContext, + "filling right join side time"), + "conditionTime" -> SQLMetrics.createTimingMetric(sparkContext, "join condition time") + ) override def genSortMergeJoinTransformerMetricsUpdater( metrics: Map[String, SQLMetric]): MetricsUpdater = new SortMergeJoinMetricsUpdater(metrics) diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala new file mode 100644 index 0000000000000..fb5147157d94c --- /dev/null +++ 
b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala @@ -0,0 +1,111 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.backendsapi.clickhouse + +import org.apache.gluten.backendsapi.RuleApi +import org.apache.gluten.extension._ +import org.apache.gluten.extension.columnar._ +import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow, RewriteSubqueryBroadcast, TransformPreOverrides} +import org.apache.gluten.extension.columnar.rewrite.RewriteSparkPlanRulesManager +import org.apache.gluten.extension.columnar.transition.{InsertTransitions, RemoveTransitions} +import org.apache.gluten.extension.injector.{RuleInjector, SparkInjector} +import org.apache.gluten.extension.injector.GlutenInjector.{LegacyInjector, RasInjector} +import org.apache.gluten.parser.{GlutenCacheFilesSqlParser, GlutenClickhouseSqlParser} +import org.apache.gluten.sql.shims.SparkShimLoader + +import org.apache.spark.sql.catalyst.{CHAggregateFunctionRewriteRule, EqualToRewrite} +import org.apache.spark.sql.execution.{ColumnarCollapseTransformStages, GlutenFallbackReporter} +import org.apache.spark.util.SparkPlanRules + +class CHRuleApi extends RuleApi { + import CHRuleApi._ + override def injectRules(injector: RuleInjector): Unit = { + injectSpark(injector.spark) + injectLegacy(injector.gluten.legacy) + injectRas(injector.gluten.ras) + } +} + +private object CHRuleApi { + def injectSpark(injector: SparkInjector): Unit = { + // Regular Spark rules. + injector.injectQueryStagePrepRule(FallbackBroadcastHashJoinPrepQueryStage.apply) + injector.injectParser( + (spark, parserInterface) => new GlutenCacheFilesSqlParser(spark, parserInterface)) + injector.injectParser( + (spark, parserInterface) => new GlutenClickhouseSqlParser(spark, parserInterface)) + injector.injectResolutionRule( + spark => new RewriteToDateExpresstionRule(spark, spark.sessionState.conf)) + injector.injectResolutionRule( + spark => new RewriteDateTimestampComparisonRule(spark, spark.sessionState.conf)) + injector.injectOptimizerRule( + spark => new CommonSubexpressionEliminateRule(spark, spark.sessionState.conf)) + injector.injectOptimizerRule(spark => CHAggregateFunctionRewriteRule(spark)) + injector.injectOptimizerRule(_ => CountDistinctWithoutExpand) + injector.injectOptimizerRule(_ => EqualToRewrite) + } + + def injectLegacy(injector: LegacyInjector): Unit = { + // Gluten columnar: Transform rules. 
+ injector.injectTransform(_ => RemoveTransitions) + injector.injectTransform(c => FallbackOnANSIMode.apply(c.session)) + injector.injectTransform(c => FallbackMultiCodegens.apply(c.session)) + injector.injectTransform(_ => RewriteSubqueryBroadcast()) + injector.injectTransform(c => FallbackBroadcastHashJoin.apply(c.session)) + injector.injectTransform(c => MergeTwoPhasesHashBaseAggregate.apply(c.session)) + injector.injectTransform(_ => RewriteSparkPlanRulesManager()) + injector.injectTransform(_ => AddFallbackTagRule()) + injector.injectTransform(_ => TransformPreOverrides()) + injector.injectTransform(_ => RemoveNativeWriteFilesSortAndProject()) + injector.injectTransform(c => RewriteTransformer.apply(c.session)) + injector.injectTransform(_ => EnsureLocalSortRequirements) + injector.injectTransform(_ => EliminateLocalSort) + injector.injectTransform(_ => CollapseProjectExecTransformer) + injector.injectTransform(c => RewriteSortMergeJoinToHashJoinRule.apply(c.session)) + injector.injectTransform( + c => SparkPlanRules.extendedColumnarRule(c.conf.extendedColumnarTransformRules)(c.session)) + injector.injectTransform(c => InsertTransitions(c.outputsColumnar)) + + // Gluten columnar: Fallback policies. + injector.injectFallbackPolicy( + c => ExpandFallbackPolicy(c.ac.isAdaptiveContext(), c.ac.originalPlan())) + + // Gluten columnar: Post rules. + injector.injectPost(c => RemoveTopmostColumnarToRow(c.session, c.ac.isAdaptiveContext())) + SparkShimLoader.getSparkShims + .getExtendedColumnarPostRules() + .foreach(each => injector.injectPost(c => each(c.session))) + injector.injectPost(c => ColumnarCollapseTransformStages(c.conf)) + injector.injectTransform( + c => SparkPlanRules.extendedColumnarRule(c.conf.extendedColumnarPostRules)(c.session)) + + // Gluten columnar: Final rules. + injector.injectFinal(c => RemoveGlutenTableCacheColumnarToRow(c.session)) + injector.injectFinal(c => GlutenFallbackReporter(c.conf, c.session)) + injector.injectFinal(_ => RemoveFallbackTagRule()) + } + + def injectRas(injector: RasInjector): Unit = { + // CH backend doesn't work with RAS at the moment. Inject a rule that aborts any + // execution calls. 
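[Editor's note, not part of the patch] For readers unfamiliar with the pattern named in the comment above: an abort rule is just a physical-plan rule whose apply throws, so driving the CH backend through the RAS path fails fast with an actionable message rather than producing a broken plan. The snippet below is a conceptual stand-in only, not the actual SparkPlanRules.AbortRule implementation.

import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkPlan

// Conceptual stand-in: rejects every plan it is asked to transform.
case class AbortWithMessage(message: String) extends Rule[SparkPlan] {
  override def apply(plan: SparkPlan): SparkPlan =
    throw new UnsupportedOperationException(message)
}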
+ injector.inject( + _ => + new SparkPlanRules.AbortRule( + "Clickhouse backend doesn't yet have RAS support, please try disabling RAS and" + + " rerunning the application")) + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala index bba5525edb955..6761269651c1b 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHSparkPlanExecApi.scala @@ -18,45 +18,41 @@ package org.apache.gluten.backendsapi.clickhouse import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.{BackendsApiManager, SparkPlanExecApi} +import org.apache.gluten.exception.GlutenException import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.execution._ import org.apache.gluten.expression._ -import org.apache.gluten.extension.{CountDistinctWithoutExpand, FallbackBroadcastHashJoin, FallbackBroadcastHashJoinPrepQueryStage, RewriteSortMergeJoinToHashJoinRule, RewriteToDateExpresstionRule} import org.apache.gluten.extension.columnar.AddFallbackTagRule import org.apache.gluten.extension.columnar.MiscColumnarRules.TransformPreOverrides import org.apache.gluten.extension.columnar.transition.Convention -import org.apache.gluten.parser.GlutenClickhouseSqlParser import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.expression.{ExpressionBuilder, ExpressionNode, WindowFunctionNode} import org.apache.gluten.utils.{CHJoinValidateUtil, UnknownJoinStrategy} import org.apache.gluten.vectorized.CHColumnarBatchSerializer -import org.apache.spark.{ShuffleDependency, SparkException} +import org.apache.spark.ShuffleDependency +import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.{GenShuffleWriterParameters, GlutenShuffleWriterWrapper, HashPartitioningWrapper} import org.apache.spark.shuffle.utils.CHShuffleUtil -import org.apache.spark.sql.{SparkSession, Strategy} -import org.apache.spark.sql.catalyst.{CHAggregateFunctionRewriteRule, EqualToRewrite} +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, CollectList, CollectSet} import org.apache.spark.sql.catalyst.optimizer.BuildSide -import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, HashPartitioning, Partitioning, RangePartitioning} -import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.delta.files.TahoeFileIndex import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.adaptive.AQEShuffleReadExec -import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFsRelation, WriteJobDescription} +import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFsRelation} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.datasources.v2.clickhouse.source.DeltaMergeTreeFileFormat import 
org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.{BuildSideRelation, ClickHouseBuildSideRelation, HashedRelationBroadcastMode} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.utils.{CHExecUtil, PushDownUtil} -import org.apache.spark.sql.extension.{CommonSubexpressionEliminateRule, RewriteDateTimestampComparisonRule} import org.apache.spark.sql.types.{DecimalType, StructType} import org.apache.spark.sql.vectorized.ColumnarBatch @@ -68,7 +64,7 @@ import java.util.{ArrayList => JArrayList, List => JList, Map => JMap} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer -class CHSparkPlanExecApi extends SparkPlanExecApi { +class CHSparkPlanExecApi extends SparkPlanExecApi with Logging { /** The columnar-batch type this backend is using. */ override def batchType: Convention.BatchType = CHBatch @@ -308,7 +304,7 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { condition: Option[Expression], left: SparkPlan, right: SparkPlan, - isSkewJoin: Boolean): ShuffledHashJoinExecTransformerBase = + isSkewJoin: Boolean): ShuffledHashJoinExecTransformerBase = { CHShuffledHashJoinExecTransformer( leftKeys, rightKeys, @@ -318,6 +314,7 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { left, right, isSkewJoin) + } /** Generate BroadcastHashJoinExecTransformer. */ def genBroadcastHashJoinExecTransformer( @@ -432,6 +429,10 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { } // scalastyle:on argcount + /** Determine whether to use sort-based shuffle based on shuffle partitioning and output. */ + override def useSortBasedShuffle(partitioning: Partitioning, output: Seq[Attribute]): Boolean = + false + /** * Generate ColumnarShuffleWriter for ColumnarShuffleManager. * @@ -533,92 +534,22 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { CHExecUtil.buildSideRDD(dataSize, newChild).collect val batches = countsAndBytes.map(_._2) + val totalBatchesSize = batches.map(_.length).sum val rawSize = dataSize.value if (rawSize >= BroadcastExchangeExec.MAX_BROADCAST_TABLE_BYTES) { - throw new SparkException( - s"Cannot broadcast the table that is larger than 8GB: ${rawSize >> 30} GB") + throw new GlutenException( + s"Cannot broadcast the table that is larger than 8GB: $rawSize bytes") + } + if ((rawSize == 0 && totalBatchesSize != 0) || totalBatchesSize < 0) { + throw new GlutenException( + s"Invalid rawSize($rawSize) or totalBatchesSize ($totalBatchesSize). Ensure the shuffle" + + s" written bytes is correct.") } val rowCount = countsAndBytes.map(_._1).sum numOutputRows += rowCount ClickHouseBuildSideRelation(mode, newOutput, batches.flatten, rowCount, newBuildKeys) } - /** - * Generate extended DataSourceV2 Strategies. Currently only for ClickHouse backend. - * - * @return - */ - override def genExtendedDataSourceV2Strategies(): List[SparkSession => Strategy] = { - List.empty - } - - /** - * Generate extended query stage preparation rules. - * - * @return - */ - override def genExtendedQueryStagePrepRules(): List[SparkSession => Rule[SparkPlan]] = { - List(spark => FallbackBroadcastHashJoinPrepQueryStage(spark)) - } - - /** - * Generate extended Analyzers. Currently only for ClickHouse backend. 
- * - * @return - */ - override def genExtendedAnalyzers(): List[SparkSession => Rule[LogicalPlan]] = { - List( - spark => new RewriteToDateExpresstionRule(spark, spark.sessionState.conf), - spark => new RewriteDateTimestampComparisonRule(spark, spark.sessionState.conf)) - } - - /** - * Generate extended Optimizers. - * - * @return - */ - override def genExtendedOptimizers(): List[SparkSession => Rule[LogicalPlan]] = { - List( - spark => new CommonSubexpressionEliminateRule(spark, spark.sessionState.conf), - spark => CHAggregateFunctionRewriteRule(spark), - _ => CountDistinctWithoutExpand, - _ => EqualToRewrite - ) - } - - /** - * Generate extended columnar pre-rules, in the validation phase. - * - * @return - */ - override def genExtendedColumnarValidationRules(): List[SparkSession => Rule[SparkPlan]] = - List(spark => FallbackBroadcastHashJoin(spark)) - - /** - * Generate extended columnar pre-rules. - * - * @return - */ - override def genExtendedColumnarTransformRules(): List[SparkSession => Rule[SparkPlan]] = - List(spark => RewriteSortMergeJoinToHashJoinRule(spark)) - - override def genInjectPostHocResolutionRules(): List[SparkSession => Rule[LogicalPlan]] = { - List() - } - - /** - * Generate extended Strategies. - * - * @return - */ - override def genExtendedStrategies(): List[SparkSession => Strategy] = - List() - - override def genInjectExtendedParser() - : List[(SparkSession, ParserInterface) => ParserInterface] = { - List((spark, parserInterface) => new GlutenClickhouseSqlParser(spark, parserInterface)) - } - /** Define backend specfic expression mappings. */ override def extraExpressionMappings: Seq[Sig] = { List( @@ -682,8 +613,22 @@ class CHSparkPlanExecApi extends SparkPlanExecApi { CHRegExpReplaceTransformer(substraitExprName, children, expr) } - def createBackendWrite(description: WriteJobDescription): BackendWrite = ClickhouseBackendWrite( - description) + override def createColumnarWriteFilesExec( + child: SparkPlan, + noop: SparkPlan, + fileFormat: FileFormat, + partitionColumns: Seq[Attribute], + bucketSpec: Option[BucketSpec], + options: Map[String, String], + staticPartitions: TablePartitionSpec): ColumnarWriteFilesExec = + CHColumnarWriteFilesExec( + child, + noop, + fileFormat, + partitionColumns, + bucketSpec, + options, + staticPartitions) override def createColumnarArrowEvalPythonExec( udfs: Seq[PythonUDF], diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/package.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/package.scala index 8704fac7bceeb..8975fb315fef7 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/package.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/package.scala @@ -21,6 +21,23 @@ import org.apache.gluten.extension.columnar.transition.Convention import org.apache.spark.sql.execution.{CHColumnarToRowExec, RowToCHNativeColumnarExec, SparkPlan} package object clickhouse { + + /** + * ClickHouse batch convention. + * + * [[fromRow]] and [[toRow]] need a [[TransitionDef]] instance. The scala allows an compact way to + * implement trait using a lambda function. + * + * Here the detail definition is given in [[CHBatch.fromRow]]. 
+ * {{{ + * fromRow(new TransitionDef { + * override def create(): Transition = new Transition { + * override protected def apply0(plan: SparkPlan): SparkPlan = + * RowToCHNativeColumnarExec(plan) + * } + * }) + * }}} + */ case object CHBatch extends Convention.BatchType { fromRow( () => diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala index 3aab5a6eb9986..abd87468f02aa 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHBroadcastNestedLoopJoinExecTransformer.scala @@ -22,7 +22,7 @@ import org.apache.gluten.extension.ValidationResult import org.apache.spark.rdd.RDD import org.apache.spark.rpc.GlutenDriverEndpoint import org.apache.spark.sql.catalyst.expressions.Expression -import org.apache.spark.sql.catalyst.optimizer.BuildSide +import org.apache.spark.sql.catalyst.optimizer.{BuildRight, BuildSide} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.{InnerLike, JoinType, LeftSemi} import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} @@ -68,6 +68,7 @@ case class CHBroadcastNestedLoopJoinExecTransformer( BroadCastHashJoinContext( Seq.empty, finalJoinType, + buildSide == BuildRight, false, joinType.isInstanceOf[ExistenceJoin], buildPlan.output, diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashAggregateExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashAggregateExecTransformer.scala index 7e688814381bc..6c1fee39c423b 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashAggregateExecTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashAggregateExecTransformer.scala @@ -370,8 +370,9 @@ case class CHHashAggregateExecTransformer( // Use approxPercentile.nullable as the nullable of the struct type // to make sure it returns null when input is empty fields = fields :+ (approxPercentile.child.dataType, approxPercentile.nullable) - fields = fields :+ (approxPercentile.percentageExpression.dataType, - approxPercentile.percentageExpression.nullable) + fields = fields :+ ( + approxPercentile.percentageExpression.dataType, + approxPercentile.percentageExpression.nullable) (makeStructType(fields), attr.nullable) case _ => (makeStructTypeSingleOne(attr.dataType, attr.nullable), attr.nullable) diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashJoinExecTransformer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashJoinExecTransformer.scala index ed946e1d263d7..2dd45281e4169 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashJoinExecTransformer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/execution/CHHashJoinExecTransformer.scala @@ -16,6 +16,7 @@ */ package org.apache.gluten.execution +import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.extension.ValidationResult import org.apache.gluten.utils.{BroadcastHashJoinStrategy, CHJoinValidateUtil, ShuffleHashJoinStrategy} @@ -23,12 +24,13 @@ import org.apache.spark.{broadcast, SparkContext} import org.apache.spark.rdd.RDD import org.apache.spark.rpc.GlutenDriverEndpoint import org.apache.spark.sql.catalyst.expressions._ 
-import org.apache.spark.sql.catalyst.optimizer.BuildSide +import org.apache.spark.sql.catalyst.optimizer._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} import org.apache.spark.sql.execution.joins.BuildSideRelation import org.apache.spark.sql.vectorized.ColumnarBatch +import com.google.protobuf.{Any, StringValue} import io.substrait.proto.JoinRel object JoinTypeTransform { @@ -41,23 +43,39 @@ object JoinTypeTransform { } } - def toSubstraitType(joinType: JoinType): JoinRel.JoinType = { - joinType match { + def toSubstraitJoinType(sparkJoin: JoinType, buildRight: Boolean): JoinRel.JoinType = + sparkJoin match { case _: InnerLike => JoinRel.JoinType.JOIN_TYPE_INNER case FullOuter => JoinRel.JoinType.JOIN_TYPE_OUTER - case LeftOuter | RightOuter => - JoinRel.JoinType.JOIN_TYPE_LEFT + case LeftOuter => + if (!buildRight) { + JoinRel.JoinType.JOIN_TYPE_RIGHT + } else { + JoinRel.JoinType.JOIN_TYPE_LEFT + } + case RightOuter => + if (!buildRight) { + JoinRel.JoinType.JOIN_TYPE_LEFT + } else { + JoinRel.JoinType.JOIN_TYPE_RIGHT + } case LeftSemi | ExistenceJoin(_) => + if (!buildRight) { + throw new IllegalArgumentException("LeftSemi join should not switch children") + } JoinRel.JoinType.JOIN_TYPE_LEFT_SEMI case LeftAnti => + if (!buildRight) { + throw new IllegalArgumentException("LeftAnti join should not switch children") + } JoinRel.JoinType.JOIN_TYPE_ANTI case _ => // TODO: Support cross join with Cross Rel JoinRel.JoinType.UNRECOGNIZED } - } + } case class CHShuffledHashJoinExecTransformer( @@ -96,8 +114,68 @@ case class CHShuffledHashJoinExecTransformer( super.doValidateInternal() } private val finalJoinType = JoinTypeTransform.toNativeJoinType(joinType) - override protected lazy val substraitJoinType: JoinRel.JoinType = - JoinTypeTransform.toSubstraitType(joinType) + + override def genJoinParameters(): Any = { + val (isBHJ, isNullAwareAntiJoin, buildHashTableId): (Int, Int, String) = (0, 0, "") + + // Start with "JoinParameters:" + val joinParametersStr = new StringBuffer("JoinParameters:") + // isBHJ: 0 for SHJ, 1 for BHJ + // isNullAwareAntiJoin: 0 for false, 1 for true + // buildHashTableId: the unique id for the hash table of build plan + joinParametersStr + .append("isBHJ=") + .append(isBHJ) + .append("\n") + .append("isNullAwareAntiJoin=") + .append(isNullAwareAntiJoin) + .append("\n") + .append("buildHashTableId=") + .append(buildHashTableId) + .append("\n") + .append("isExistenceJoin=") + .append(if (joinType.isInstanceOf[ExistenceJoin]) 1 else 0) + .append("\n") + + CHAQEUtil.getShuffleQueryStageStats(streamedPlan) match { + case Some(stats) => + joinParametersStr + .append("leftRowCount=") + .append(stats.rowCount.getOrElse(-1)) + .append("\n") + .append("leftSizeInBytes=") + .append(stats.sizeInBytes) + .append("\n") + case _ => + } + CHAQEUtil.getShuffleQueryStageStats(buildPlan) match { + case Some(stats) => + joinParametersStr + .append("rightRowCount=") + .append(stats.rowCount.getOrElse(-1)) + .append("\n") + .append("rightSizeInBytes=") + .append(stats.sizeInBytes) + .append("\n") + case _ => + } + joinParametersStr + .append("numPartitions=") + .append(outputPartitioning.numPartitions) + .append("\n") + + val message = StringValue + .newBuilder() + .setValue(joinParametersStr.toString) + .build() + BackendsApiManager.getTransformerApiInstance.packPBMessage(message) + } + + override protected lazy val substraitJoinType: JoinRel.JoinType = { + val res = JoinTypeTransform.toSubstraitJoinType(joinType, buildSide 
== BuildRight) + logDebug(s"Convert join type from: $joinType:$buildSide to $res $needSwitchChildren") + res + } } case class CHBroadcastBuildSideRDD( @@ -115,6 +193,7 @@ case class CHBroadcastBuildSideRDD( case class BroadCastHashJoinContext( buildSideJoinKeys: Seq[Expression], joinType: JoinType, + buildRight: Boolean, hasMixedFiltCondition: Boolean, isExistenceJoin: Boolean, buildSideStructure: Seq[Attribute], @@ -177,6 +256,7 @@ case class CHBroadcastHashJoinExecTransformer( BroadCastHashJoinContext( buildKeyExprs, finalJoinType, + buildSide == BuildRight, isMixedCondition(condition), joinType.isInstanceOf[ExistenceJoin], buildPlan.output, @@ -204,6 +284,7 @@ case class CHBroadcastHashJoinExecTransformer( // We don't have left any join in substrait, so use left semi join instead. // and isExistenceJoin is set to true to indicate that it is an existence join. private val finalJoinType = JoinTypeTransform.toNativeJoinType(joinType) - override protected lazy val substraitJoinType: JoinRel.JoinType = - JoinTypeTransform.toSubstraitType(joinType) + override protected lazy val substraitJoinType: JoinRel.JoinType = { + JoinTypeTransform.toSubstraitJoinType(joinType, buildSide == BuildRight) + } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/CommonSubexpressionEliminateRule.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/CommonSubexpressionEliminateRule.scala similarity index 97% rename from gluten-core/src/main/scala/org/apache/gluten/extension/CommonSubexpressionEliminateRule.scala rename to backends-clickhouse/src/main/scala/org/apache/gluten/extension/CommonSubexpressionEliminateRule.scala index 8976681db2606..a3b74366fc7b7 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/CommonSubexpressionEliminateRule.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/CommonSubexpressionEliminateRule.scala @@ -14,15 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.extension +package org.apache.gluten.extension import org.apache.gluten.GlutenConfig import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression -import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, AggregateFunction} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/CountDistinctWithoutExpand.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/CountDistinctWithoutExpand.scala similarity index 100% rename from gluten-core/src/main/scala/org/apache/gluten/extension/CountDistinctWithoutExpand.scala rename to backends-clickhouse/src/main/scala/org/apache/gluten/extension/CountDistinctWithoutExpand.scala diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/MergeTwoPhasesHashAggregate.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/MergeTwoPhasesHashBaseAggregate.scala similarity index 94% rename from gluten-core/src/main/scala/org/apache/gluten/extension/columnar/MergeTwoPhasesHashAggregate.scala rename to backends-clickhouse/src/main/scala/org/apache/gluten/extension/MergeTwoPhasesHashBaseAggregate.scala index e19cd09a01a3d..43adf27b4eb18 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/MergeTwoPhasesHashAggregate.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/MergeTwoPhasesHashBaseAggregate.scala @@ -14,10 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.extension.columnar +package org.apache.gluten.extension import org.apache.gluten.GlutenConfig -import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, Final, Partial} @@ -39,7 +38,7 @@ case class MergeTwoPhasesHashBaseAggregate(session: SparkSession) extends Rule[S val columnarConf: GlutenConfig = GlutenConfig.getConf val scanOnly: Boolean = columnarConf.enableScanOnly val enableColumnarHashAgg: Boolean = !scanOnly && columnarConf.enableColumnarHashAgg - val replaceSortAggWithHashAgg = BackendsApiManager.getSettings.replaceSortAggWithHashAgg + val replaceSortAggWithHashAgg: Boolean = GlutenConfig.getConf.forceToUseHashAgg private def isPartialAgg(partialAgg: BaseAggregateExec, finalAgg: BaseAggregateExec): Boolean = { // TODO: now it can not support to merge agg which there are the filters in the aggregate exprs. 
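For readers following MergeTwoPhasesHashBaseAggregate across the hunks above and below: once the `enableColumnarHashAgg` gate passes, the rule's job is to collapse a Final-mode hash aggregate sitting directly on its Partial-mode child into a single Complete-mode aggregate. The sketch below only illustrates that shape and is not this patch's code; `MergeTwoPhasesSketch` is a hypothetical name, and the field accesses assume the Spark 3.x `HashAggregateExec`/`AggregateExpression` case classes.

```scala
import org.apache.spark.sql.catalyst.expressions.aggregate.{Complete, Final, Partial}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.aggregate.HashAggregateExec

// Hypothetical sketch: merge a Final-over-Partial HashAggregateExec pair into one
// Complete-mode aggregate, the shape of rewrite MergeTwoPhasesHashBaseAggregate
// performs for the hash-aggregate case.
object MergeTwoPhasesSketch extends Rule[SparkPlan] {
  override def apply(plan: SparkPlan): SparkPlan = plan.transformDown {
    case finalAgg: HashAggregateExec
        if finalAgg.aggregateExpressions.nonEmpty &&
          // Per the TODO above, aggregate-level filters are not supported, so skip them.
          finalAgg.aggregateExpressions.forall(e => e.mode == Final && e.filter.isEmpty) &&
          finalAgg.child.isInstanceOf[HashAggregateExec] &&
          finalAgg.child
            .asInstanceOf[HashAggregateExec]
            .aggregateExpressions
            .forall(_.mode == Partial) =>
      val partialAgg = finalAgg.child.asInstanceOf[HashAggregateExec]
      // Re-plan as a single pass over the partial aggregate's input: grouping comes from
      // the partial side, every mode becomes Complete, and the input-buffer offset is
      // reset because there is no pre-aggregated input anymore.
      finalAgg.copy(
        groupingExpressions = partialAgg.groupingExpressions,
        aggregateExpressions = finalAgg.aggregateExpressions.map(_.copy(mode = Complete)),
        initialInputBufferOffset = 0,
        child = partialAgg.child)
  }
}
```

The hunk that follows drops the backend-settings check and gates the rewrite on `enableColumnarHashAgg` alone, which fits the rule's move from gluten-core into the ClickHouse backend.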
@@ -57,10 +56,7 @@ case class MergeTwoPhasesHashBaseAggregate(session: SparkSession) extends Rule[S } override def apply(plan: SparkPlan): SparkPlan = { - if ( - !enableColumnarHashAgg || !BackendsApiManager.getSettings - .mergeTwoPhasesHashBaseAggregateIfNeed() - ) { + if (!enableColumnarHashAgg) { plan } else { plan.transformDown { diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/RewriteDateTimestampComparisonRule.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/RewriteDateTimestampComparisonRule.scala similarity index 99% rename from gluten-core/src/main/scala/org/apache/gluten/extension/RewriteDateTimestampComparisonRule.scala rename to backends-clickhouse/src/main/scala/org/apache/gluten/extension/RewriteDateTimestampComparisonRule.scala index 576ccf31b208f..ea92ddec2c8a1 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/RewriteDateTimestampComparisonRule.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/RewriteDateTimestampComparisonRule.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.spark.sql.extension +package org.apache.gluten.extension import org.apache.gluten.GlutenConfig @@ -27,8 +27,6 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String -import java.lang.IllegalArgumentException - // For readable, people usually convert a unix timestamp into date, and compare it with another // date. For example // select * from table where '2023-11-02' >= from_unixtime(unix_timestamp, 'yyyy-MM-dd') diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/RewriteToDateExpresstionRule.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/extension/RewriteToDateExpresstionRule.scala similarity index 100% rename from gluten-core/src/main/scala/org/apache/gluten/extension/RewriteToDateExpresstionRule.scala rename to backends-clickhouse/src/main/scala/org/apache/gluten/extension/RewriteToDateExpresstionRule.scala diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/FileSourceScanMetricsUpdater.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/FileSourceScanMetricsUpdater.scala index f44c5ed1a1dde..4dcae8feb92bb 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/FileSourceScanMetricsUpdater.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/FileSourceScanMetricsUpdater.scala @@ -35,9 +35,15 @@ class FileSourceScanMetricsUpdater(@transient val metrics: Map[String, SQLMetric val extraTime: SQLMetric = metrics("extraTime") val inputWaitTime: SQLMetric = metrics("inputWaitTime") val outputWaitTime: SQLMetric = metrics("outputWaitTime") - val selected_marks_pk: SQLMetric = metrics("selectedMarksPk") - val selected_marks: SQLMetric = metrics("selectedMarks") - val total_marks_pk: SQLMetric = metrics("totalMarksPk") + val selectedMarksPK: SQLMetric = metrics("selectedMarksPk") + val selectedMarks: SQLMetric = metrics("selectedMarks") + val totalMarksPK: SQLMetric = metrics("totalMarksPk") + val readCacheHits: SQLMetric = metrics("readCacheHits") + val missCacheHits: SQLMetric = metrics("missCacheHits") + val readCacheBytes: SQLMetric = metrics("readCacheBytes") + val readMissBytes: SQLMetric = metrics("readMissBytes") + val readCacheMillisecond: SQLMetric = metrics("readCacheMillisecond") + val missCacheMillisecond: SQLMetric = 
metrics("missCacheMillisecond") override def updateInputMetrics(inputMetrics: InputMetricsWrapper): Unit = { // inputMetrics.bridgeIncBytesRead(metrics("inputBytes").value) @@ -56,9 +62,15 @@ class FileSourceScanMetricsUpdater(@transient val metrics: Map[String, SQLMetric metricsData.getSteps.forEach( step => { - selected_marks_pk += step.selectedMarksPk - selected_marks += step.selectedMarks - total_marks_pk += step.totalMarksPk + selectedMarksPK += step.selectedMarksPk + selectedMarks += step.selectedMarks + totalMarksPK += step.totalMarksPk + readCacheHits += step.readCacheHits + missCacheHits += step.missCacheHits + readCacheBytes += step.readCacheBytes + readMissBytes += step.readMissBytes + readCacheMillisecond += step.readCacheMillisecond + missCacheMillisecond += step.missCacheMillisecond }) MetricsUtil.updateExtraTimeMetric( diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/SortMergeJoinMetricsUpdater.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/SortMergeJoinMetricsUpdater.scala index e8ebbbd2c9739..e5833a39bc58a 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/SortMergeJoinMetricsUpdater.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/metrics/SortMergeJoinMetricsUpdater.scala @@ -16,11 +16,103 @@ */ package org.apache.gluten.metrics +import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.metric.SQLMetric -class SortMergeJoinMetricsUpdater(val metrics: Map[String, SQLMetric]) extends MetricsUpdater { +class SortMergeJoinMetricsUpdater(val metrics: Map[String, SQLMetric]) + extends MetricsUpdater + with Logging { override def updateNativeMetrics(opMetrics: IOperatorMetrics): Unit = { - if (opMetrics != null) {} + try { + if (opMetrics != null) { + val operatorMetrics = opMetrics.asInstanceOf[OperatorMetrics] + if (!operatorMetrics.metricsList.isEmpty && operatorMetrics.joinParams != null) { + val joinParams = operatorMetrics.joinParams + var currentIdx = operatorMetrics.metricsList.size() - 1 + var totalTime = 0L + + // build side pre projection + if (joinParams.buildPreProjectionNeeded) { + metrics("buildPreProjectionTime") += + (operatorMetrics.metricsList.get(currentIdx).time / 1000L).toLong + metrics("outputVectors") += operatorMetrics.metricsList.get(currentIdx).outputVectors + totalTime += operatorMetrics.metricsList.get(currentIdx).time + currentIdx -= 1 + } + + // stream side pre projection + if (joinParams.streamPreProjectionNeeded) { + metrics("streamPreProjectionTime") += + (operatorMetrics.metricsList.get(currentIdx).time / 1000L).toLong + metrics("outputVectors") += operatorMetrics.metricsList.get(currentIdx).outputVectors + totalTime += operatorMetrics.metricsList.get(currentIdx).time + currentIdx -= 1 + } + + // update fillingRightJoinSideTime + MetricsUtil + .getAllProcessorList(operatorMetrics.metricsList.get(currentIdx)) + .foreach( + processor => { + if (processor.name.equalsIgnoreCase("FillingRightJoinSide")) { + metrics("fillingRightJoinSideTime") += (processor.time / 1000L).toLong + } + }) + + // joining + val joinMetricsData = operatorMetrics.metricsList.get(currentIdx) + metrics("outputVectors") += joinMetricsData.outputVectors + metrics("inputWaitTime") += (joinMetricsData.inputWaitTime / 1000L).toLong + metrics("outputWaitTime") += (joinMetricsData.outputWaitTime / 1000L).toLong + totalTime += joinMetricsData.time + + MetricsUtil + .getAllProcessorList(joinMetricsData) + .foreach( + processor => { + if 
(processor.name.equalsIgnoreCase("FillingRightJoinSide")) { + metrics("fillingRightJoinSideTime") += (processor.time / 1000L).toLong + } + if (processor.name.equalsIgnoreCase("FilterTransform")) { + metrics("conditionTime") += (processor.time / 1000L).toLong + } + if (processor.name.equalsIgnoreCase("JoiningTransform")) { + metrics("probeTime") += (processor.time / 1000L).toLong + } + if (!SortMergeJoinMetricsUpdater.INCLUDING_PROCESSORS.contains(processor.name)) { + metrics("extraTime") += (processor.time / 1000L).toLong + } + if (SortMergeJoinMetricsUpdater.CH_PLAN_NODE_NAME.contains(processor.name)) { + metrics("numOutputRows") += processor.outputRows + metrics("outputBytes") += processor.outputBytes + metrics("numInputRows") += processor.inputRows + metrics("inputBytes") += processor.inputBytes + } + }) + + currentIdx -= 1 + + // post projection + if (joinParams.postProjectionNeeded) { + metrics("postProjectTime") += + (operatorMetrics.metricsList.get(currentIdx).time / 1000L).toLong + metrics("outputVectors") += operatorMetrics.metricsList.get(currentIdx).outputVectors + totalTime += operatorMetrics.metricsList.get(currentIdx).time + currentIdx -= 1 + } + metrics("totalTime") += (totalTime / 1000L).toLong + } + } + } catch { + case e: Exception => + logError(s"Updating native metrics failed due to ${e.getCause}.") + throw e + } } } + +object SortMergeJoinMetricsUpdater { + val INCLUDING_PROCESSORS = Array("JoiningTransform", "FillingRightJoinSide", "FilterTransform") + val CH_PLAN_NODE_NAME = Array("JoiningTransform") +} diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/parser/GlutenCacheFileSqlParserBase.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/parser/GlutenCacheFileSqlParserBase.scala new file mode 100644 index 0000000000000..b031dcf7a1b49 --- /dev/null +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/parser/GlutenCacheFileSqlParserBase.scala @@ -0,0 +1,234 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.parser + +import org.apache.gluten.sql.parser.{GlutenCacheFileSqlBaseBaseListener, GlutenCacheFileSqlBaseBaseVisitor, GlutenCacheFileSqlBaseLexer, GlutenCacheFileSqlBaseParser} + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.parser.{ParseErrorListener, ParseException, ParserInterface} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.trees.Origin +import org.apache.spark.sql.execution.commands.GlutenCacheFilesCommand +import org.apache.spark.sql.internal.VariableSubstitution + +import org.antlr.v4.runtime._ +import org.antlr.v4.runtime.atn.PredictionMode +import org.antlr.v4.runtime.misc.ParseCancellationException +import org.antlr.v4.runtime.tree.TerminalNodeImpl + +import java.util.Locale + +import scala.collection.JavaConverters._ + +trait GlutenCacheFileSqlParserBase extends ParserInterface { + protected val astBuilder = new GlutenCacheFileSqlAstBuilder + protected val substitution = new VariableSubstitution + + protected def parse[T](command: String)(toResult: GlutenCacheFileSqlBaseParser => T): T = { + val lexer = new GlutenCacheFileSqlBaseLexer( + new UpperCaseCharStream(CharStreams.fromString(substitution.substitute(command)))) + lexer.removeErrorListeners() + lexer.addErrorListener(ParseErrorListener) + + val tokenStream = new CommonTokenStream(lexer) + val parser = new GlutenCacheFileSqlBaseParser(tokenStream) + + parser.addParseListener(GlutenCacheFileSqlPostProcessor) + parser.removeErrorListeners() + parser.addErrorListener(ParseErrorListener) + + try { + try { + // first, try parsing with potentially faster SLL mode + parser.getInterpreter.setPredictionMode(PredictionMode.SLL) + toResult(parser) + } catch { + case e: ParseCancellationException => + // if we fail, parse with LL mode + tokenStream.seek(0) // rewind input stream + parser.reset() + + // Try Again. + parser.getInterpreter.setPredictionMode(PredictionMode.LL) + toResult(parser) + } + } catch { + case e: ParseException if e.command.isDefined => + throw e + case e: ParseException => + throw e.withCommand(command) + case e: AnalysisException => + val position = Origin(e.line, e.startPosition) + throw new ParseException( + command = Option(command), + message = e.message, + start = position, + stop = position, + errorClass = Some("GLUTEN_CACHE_FILE_PARSING_ANALYSIS_ERROR")) + } + } +} + +class GlutenCacheFileSqlAstBuilder extends GlutenCacheFileSqlBaseBaseVisitor[AnyRef] { + import org.apache.spark.sql.catalyst.parser.ParserUtils._ + + /** Convert a property list into a key-value map. */ + override def visitPropertyList( + ctx: GlutenCacheFileSqlBaseParser.PropertyListContext): Map[String, String] = + withOrigin(ctx) { + val properties = ctx.property.asScala.map { + property => + val key = visitPropertyKey(property.key) + val value = visitPropertyValue(property.value) + key -> value + } + // Check for duplicate property names. + checkDuplicateKeys(properties.toSeq, ctx) + properties.toMap + } + + /** + * A property key can either be String or a collection of dot separated elements. This function + * extracts the property key based on whether its a string literal or a property identifier. + */ + override def visitPropertyKey(key: GlutenCacheFileSqlBaseParser.PropertyKeyContext): String = { + if (key.stringLit() != null) { + string(visitStringLit(key.stringLit())) + } else { + key.getText + } + } + + /** + * A property value can be String, Integer, Boolean or Decimal. 
This function extracts the + * property value based on whether its a string, integer, boolean or decimal literal. + */ + override def visitPropertyValue( + value: GlutenCacheFileSqlBaseParser.PropertyValueContext): String = { + if (value == null) { + null + } else if (value.identifier != null) { + value.identifier.getText + } else if (value.value != null) { + string(visitStringLit(value.value)) + } else if (value.booleanValue != null) { + value.getText.toLowerCase(Locale.ROOT) + } else { + value.getText + } + } + + def visitPropertyKeyValues( + ctx: GlutenCacheFileSqlBaseParser.PropertyListContext): Map[String, String] = { + val props = visitPropertyList(ctx) + val badKeys = props.collect { case (key, null) => key } + if (badKeys.nonEmpty) { + operationNotAllowed( + s"Values must be specified for key(s): ${badKeys.mkString("[", ",", "]")}", + ctx) + } + props + } + + override def visitStringLit(ctx: GlutenCacheFileSqlBaseParser.StringLitContext): Token = { + if (ctx != null) { + if (ctx.STRING != null) { + ctx.STRING.getSymbol + } else { + ctx.DOUBLEQUOTED_STRING.getSymbol + } + } else { + null + } + } + + override def visitSingleStatement( + ctx: GlutenCacheFileSqlBaseParser.SingleStatementContext): AnyRef = withOrigin(ctx) { + visit(ctx.statement).asInstanceOf[LogicalPlan] + } + + override def visitCacheFiles(ctx: GlutenCacheFileSqlBaseParser.CacheFilesContext): AnyRef = + withOrigin(ctx) { + val asynExecute = ctx.ASYNC != null + val selectedColuman = visitSelectedColumnNames(ctx.selectedColumns) + val propertyOverrides = Option(ctx.cacheProps) + .map(visitPropertyKeyValues) + .getOrElse(Map.empty[String, String]) + val path = ctx.path.getText + + GlutenCacheFilesCommand( + asynExecute, + selectedColuman, + path.substring(1, path.length - 1), + propertyOverrides + ) + } + + override def visitPassThrough(ctx: GlutenCacheFileSqlBaseParser.PassThroughContext): AnyRef = + null + + override def visitSelectedColumnNames( + ctx: GlutenCacheFileSqlBaseParser.SelectedColumnNamesContext): Option[Seq[String]] = + withOrigin(ctx) { + if (ctx != null) { + if (ctx.ASTERISK != null) { + // It means select all columns + None + } else if (ctx.identifier != null && !(ctx.identifier).isEmpty) { + Some(ctx.identifier.asScala.map(_.getText).toSeq) + } else { + throw new ParseException(s"Illegal selected column.", ctx) + } + } else { + throw new ParseException(s"Illegal selected column.", ctx) + } + } +} + +case object GlutenCacheFileSqlPostProcessor extends GlutenCacheFileSqlBaseBaseListener { + + /** Remove the back ticks from an Identifier. */ + override def exitQuotedIdentifier( + ctx: GlutenCacheFileSqlBaseParser.QuotedIdentifierContext): Unit = { + replaceTokenByIdentifier(ctx, 1) { + token => + // Remove the double back ticks in the string. + token.setText(token.getText.replace("``", "`")) + token + } + } + + /** Treat non-reserved keywords as Identifiers. 
*/ + override def exitNonReserved(ctx: GlutenCacheFileSqlBaseParser.NonReservedContext): Unit = { + replaceTokenByIdentifier(ctx, 0)(identity) + } + + private def replaceTokenByIdentifier(ctx: ParserRuleContext, stripMargins: Int)( + f: CommonToken => CommonToken = identity): Unit = { + val parent = ctx.getParent + parent.removeLastChild() + val token = ctx.getChild(0).getPayload.asInstanceOf[Token] + val newToken = new CommonToken( + new org.antlr.v4.runtime.misc.Pair(token.getTokenSource, token.getInputStream), + GlutenCacheFileSqlBaseParser.IDENTIFIER, + token.getChannel, + token.getStartIndex + stripMargins, + token.getStopIndex - stripMargins + ) + parent.addChild(new TerminalNodeImpl(f(newToken))) + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/parser/GlutenClickhouseSqlParserBase.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/parser/GlutenClickhouseSqlParserBase.scala index 18fc102bec3d1..4a3883c8cc2b7 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/parser/GlutenClickhouseSqlParserBase.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/parser/GlutenClickhouseSqlParserBase.scala @@ -29,7 +29,7 @@ import org.apache.spark.sql.internal.VariableSubstitution import org.antlr.v4.runtime._ import org.antlr.v4.runtime.atn.PredictionMode -import org.antlr.v4.runtime.misc.{Interval, ParseCancellationException} +import org.antlr.v4.runtime.misc.ParseCancellationException import org.antlr.v4.runtime.tree.TerminalNodeImpl import java.util.Locale @@ -256,21 +256,3 @@ case object PostProcessor extends GlutenClickhouseSqlBaseBaseListener { parent.addChild(new TerminalNodeImpl(f(newToken))) } } - -class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream { - override def consume(): Unit = wrapped.consume - override def getSourceName(): String = wrapped.getSourceName - override def index(): Int = wrapped.index - override def mark(): Int = wrapped.mark - override def release(marker: Int): Unit = wrapped.release(marker) - override def seek(where: Int): Unit = wrapped.seek(where) - override def size(): Int = wrapped.size - - override def getText(interval: Interval): String = wrapped.getText(interval) - - override def LA(i: Int): Int = { - val la = wrapped.LA(i) - if (la == 0 || la == IntStream.EOF) la - else Character.toUpperCase(la) - } -} diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/parser/UpperCaseCharStream.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/parser/UpperCaseCharStream.scala new file mode 100644 index 0000000000000..6ee2aac81c74a --- /dev/null +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/parser/UpperCaseCharStream.scala @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.parser + +import org.antlr.v4.runtime.{CharStream, CodePointCharStream, IntStream} +import org.antlr.v4.runtime.misc.Interval + +class UpperCaseCharStream(wrapped: CodePointCharStream) extends CharStream { + override def consume(): Unit = wrapped.consume + override def getSourceName(): String = wrapped.getSourceName + override def index(): Int = wrapped.index + override def mark(): Int = wrapped.mark + override def release(marker: Int): Unit = wrapped.release(marker) + override def seek(where: Int): Unit = wrapped.seek(where) + override def size(): Int = wrapped.size + + override def getText(interval: Interval): String = wrapped.getText(interval) + + override def LA(i: Int): Int = { + val la = wrapped.LA(i) + if (la == 0 || la == IntStream.EOF) la + else Character.toUpperCase(la) + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHAQEUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHAQEUtil.scala new file mode 100644 index 0000000000000..9a35517f54fc0 --- /dev/null +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHAQEUtil.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution + +import org.apache.spark.sql.catalyst.plans.logical._ +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.adaptive._ + +object CHAQEUtil { + + // All TransformSupports have lost the logicalLink. So we need iterate the plan to find the + // first ShuffleQueryStageExec and get the runtime stats. 
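+  // For example, an AQEShuffleReadExec wrapping a ShuffleQueryStageExec is walked down its +  // single child until the stage is reached, while a node with more than one child (e.g. a +  // join) stops the search and returns None.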
+ def getShuffleQueryStageStats(plan: SparkPlan): Option[Statistics] = { + plan match { + case stage: ShuffleQueryStageExec => + Some(stage.getRuntimeStatistics) + case _ => + if (plan.children.length == 1) { + getShuffleQueryStageStats(plan.children.head) + } else { + None + } + } + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala index d65de1cea151d..ae072b0fbe85e 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/utils/CHExpressionUtil.scala @@ -166,6 +166,13 @@ case class ArrayJoinValidator() extends FunctionValidator { } } +case class FormatStringValidator() extends FunctionValidator { + override def doValidate(expr: Expression): Boolean = { + val formatString = expr.asInstanceOf[FormatString] + formatString.children.head.isInstanceOf[Literal] + } +} + object CHExpressionUtil { final val CH_AGGREGATE_FUNC_BLACKLIST: Map[String, FunctionValidator] = Map( @@ -199,6 +206,7 @@ object CHExpressionUtil { SPARK_PARTITION_ID -> DefaultValidator(), URL_DECODE -> DefaultValidator(), URL_ENCODE -> DefaultValidator(), + FORMAT_STRING -> FormatStringValidator(), SKEWNESS -> DefaultValidator(), SOUNDEX -> DefaultValidator(), MAKE_YM_INTERVAL -> DefaultValidator(), diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/vectorized/CHColumnarBatchSerializer.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/vectorized/CHColumnarBatchSerializer.scala index f640bfd2d7f19..fa6f8addf163a 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/vectorized/CHColumnarBatchSerializer.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/vectorized/CHColumnarBatchSerializer.scala @@ -54,8 +54,14 @@ private class CHColumnarBatchSerializerInstance( extends SerializerInstance with Logging { - private lazy val compressionCodec = - GlutenShuffleUtils.getCompressionCodec(SparkEnv.get.conf).toUpperCase(Locale.ROOT) + private lazy val conf = SparkEnv.get.conf + private lazy val compressionCodec = GlutenShuffleUtils.getCompressionCodec(conf) + private lazy val capitalizedCompressionCodec = compressionCodec.toUpperCase(Locale.ROOT) + private lazy val compressionLevel = + GlutenShuffleUtils.getCompressionLevel( + conf, + compressionCodec, + GlutenConfig.getConf.columnarShuffleCodecBackend.orNull) override def deserializeStream(in: InputStream): DeserializationStream = { new DeserializationStream { @@ -136,7 +142,8 @@ private class CHColumnarBatchSerializerInstance( writeBuffer, dataSize, CHBackendSettings.useCustomizedShuffleCodec, - compressionCodec, + capitalizedCompressionCodec, + compressionLevel, CHBackendSettings.customizeBufferSize ) diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala index 4d90ab6533ba7..7f2b94eea314e 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenExecutorEndpoint.scala @@ -64,8 +64,6 @@ class GlutenExecutorEndpoint(val executorId: String, val conf: SparkConf) hashIds.forEach( resource_id => CHBroadcastBuildSideCache.invalidateBroadcastHashtable(resource_id)) } - case GlutenMergeTreeCacheLoad(mergeTreeTable, columns) => - CHNativeCacheManager.cacheParts(mergeTreeTable, 
columns, true) case e => logError(s"Received unexpected message. $e") @@ -74,11 +72,26 @@ class GlutenExecutorEndpoint(val executorId: String, val conf: SparkConf) override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case GlutenMergeTreeCacheLoad(mergeTreeTable, columns) => try { - CHNativeCacheManager.cacheParts(mergeTreeTable, columns, false) - context.reply(CacheLoadResult(true)) + val jobId = CHNativeCacheManager.cacheParts(mergeTreeTable, columns) + context.reply(CacheJobInfo(status = true, jobId)) } catch { case _: Exception => - context.reply(CacheLoadResult(false, s"executor: $executorId cache data failed.")) + context.reply( + CacheJobInfo(status = false, "", s"executor: $executorId cache data failed.")) + } + case GlutenCacheLoadStatus(jobId) => + val status = CHNativeCacheManager.getCacheStatus(jobId) + context.reply(status) + case GlutenFilesCacheLoad(files) => + try { + val jobId = CHNativeCacheManager.nativeCacheFiles(files) + context.reply(CacheJobInfo(status = true, jobId)) + } catch { + case e: Exception => + context.reply( + CacheJobInfo( + status = false, + s"executor: $executorId cache data failed. ${e.getMessage}")) } case e => logError(s"Received unexpected message. $e") diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenRpcMessages.scala b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenRpcMessages.scala index d675d705f10a2..e596e94fed722 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenRpcMessages.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/rpc/GlutenRpcMessages.scala @@ -35,8 +35,16 @@ object GlutenRpcMessages { case class GlutenCleanExecutionResource(executionId: String, broadcastHashIds: util.Set[String]) extends GlutenRpcMessage + // for mergetree cache case class GlutenMergeTreeCacheLoad(mergeTreeTable: String, columns: util.Set[String]) extends GlutenRpcMessage - case class CacheLoadResult(success: Boolean, reason: String = "") extends GlutenRpcMessage + case class GlutenCacheLoadStatus(jobId: String) + + case class CacheJobInfo(status: Boolean, jobId: String, reason: String = "") + extends GlutenRpcMessage + + case class GlutenFilesCacheLoad(files: Array[Byte]) extends GlutenRpcMessage + + case class GlutenFilesCacheLoadStatus(jobId: String) } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/shuffle/CHColumnarShuffleWriter.scala b/backends-clickhouse/src/main/scala/org/apache/spark/shuffle/CHColumnarShuffleWriter.scala index db9bba5f170a3..758c487a18aa4 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/shuffle/CHColumnarShuffleWriter.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/shuffle/CHColumnarShuffleWriter.scala @@ -51,8 +51,13 @@ class CHColumnarShuffleWriter[K, V]( .mkString(",") private val subDirsPerLocalDir = blockManager.diskBlockManager.subDirsPerLocalDir private val splitSize = GlutenConfig.getConf.maxBatchSize - private val customizedCompressCodec = - GlutenShuffleUtils.getCompressionCodec(conf).toUpperCase(Locale.ROOT) + private val compressionCodec = GlutenShuffleUtils.getCompressionCodec(conf) + private val capitalizedCompressionCodec = compressionCodec.toUpperCase(Locale.ROOT) + private val compressionLevel = + GlutenShuffleUtils.getCompressionLevel( + conf, + compressionCodec, + GlutenConfig.getConf.columnarShuffleCodecBackend.orNull) private val maxSortBufferSize = GlutenConfig.getConf.chColumnarMaxSortBufferSize private val forceMemorySortShuffle = 
GlutenConfig.getConf.chColumnarForceMemorySortShuffle private val spillThreshold = GlutenConfig.getConf.chColumnarShuffleSpillThreshold @@ -98,7 +103,8 @@ class CHColumnarShuffleWriter[K, V]( dep.shuffleId, mapId, splitSize, - customizedCompressCodec, + capitalizedCompressionCodec, + compressionLevel, dataTmp.getAbsolutePath, localDirs, subDirsPerLocalDir, @@ -178,6 +184,6 @@ class CHColumnarShuffleWriter[K, V]( } // VisibleForTesting - def getPartitionLengths: Array[Long] = partitionLengths + def getPartitionLengths(): Array[Long] = partitionLengths } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2Base.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2Base.scala index 633d23f77b1ba..1ab2e12d43e30 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2Base.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/delta/catalog/ClickHouseTableV2Base.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.sql.delta.catalog +import org.apache.gluten.expression.ConverterUtils import org.apache.gluten.expression.ConverterUtils.normalizeColName import org.apache.spark.sql.catalyst.catalog.{BucketSpec, CatalogTable} @@ -49,10 +50,9 @@ trait ClickHouseTableV2Base { if (tableProperties.containsKey("numBuckets")) { val numBuckets = tableProperties.get("numBuckets").trim.toInt val bucketColumnNames: Seq[String] = - tableProperties.get("bucketColumnNames").split(",").map(_.trim).toSeq - val sortColumnNames: Seq[String] = if (tableProperties.containsKey("orderByKey")) { - tableProperties.get("orderByKey").split(",").map(_.trim).toSeq - } else Seq.empty[String] + getCommaSeparatedColumns("bucketColumnNames").getOrElse(Seq.empty[String]) + val sortColumnNames: Seq[String] = + getCommaSeparatedColumns("orderByKey").getOrElse(Seq.empty[String]) Some(BucketSpec(numBuckets, bucketColumnNames, sortColumnNames)) } else { None @@ -79,7 +79,11 @@ trait ClickHouseTableV2Base { val tableProperties = deltaProperties if (tableProperties.containsKey(keyName)) { if (tableProperties.get(keyName).nonEmpty) { - val keys = tableProperties.get(keyName).split(",").map(_.trim).toSeq + val keys = tableProperties + .get(keyName) + .split(",") + .map(n => ConverterUtils.normalizeColName(n.trim)) + .toSeq keys.foreach( s => { if (s.contains(".")) { diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWrite.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWrite.scala new file mode 100644 index 0000000000000..6a5c19a4f9394 --- /dev/null +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWrite.scala @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import org.apache.gluten.backendsapi.BackendsApiManager + +import org.apache.spark.TaskContext +import org.apache.spark.internal.Logging +import org.apache.spark.internal.io.{FileCommitProtocol, FileNameSpec, HadoopMapReduceCommitProtocol} +import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage +import org.apache.spark.sql.catalyst.expressions.GenericInternalRow +import org.apache.spark.sql.execution.datasources.{BasicWriteTaskStats, ExecutedWriteSummary, PartitioningUtils, WriteJobDescription, WriteTaskResult} +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.Utils + +import org.apache.hadoop.fs.Path +import org.apache.hadoop.mapreduce.{JobID, OutputCommitter, TaskAttemptContext, TaskAttemptID, TaskID, TaskType} +import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl + +import java.lang.reflect.Field + +import scala.collection.mutable + +trait CHColumnarWrite[T <: FileCommitProtocol] { + + def description: WriteJobDescription + def jobTrackerID: String + def committer: T + def doSetupNativeTask(): Unit + + def setupTask(): Unit = { + committer.setupTask(taskAttemptContext) + doSetupNativeTask() + } + + def abortTask(): Unit = { + committer.abortTask(taskAttemptContext) + } + def commitTask(batch: ColumnarBatch): Option[WriteTaskResult] + + lazy val (taskAttemptContext: TaskAttemptContext, jobId: String) = { + // Copied from `SparkHadoopWriterUtils.createJobID` to be compatible with multi-version + def createJobID(jobTrackerID: String, id: Int): JobID = { + if (id < 0) { + throw new IllegalArgumentException("Job number is negative") + } + new JobID(jobTrackerID, id) + } + + val sparkStageId: Int = TaskContext.get().stageId() + val sparkPartitionId: Int = TaskContext.get().partitionId() + val sparkAttemptNumber = TaskContext.get().taskAttemptId().toInt & Int.MaxValue + val jobID = createJobID(jobTrackerID, sparkStageId) + val taskId = new TaskID(jobID, TaskType.MAP, sparkPartitionId) + val taskAttemptId = new TaskAttemptID(taskId, sparkAttemptNumber) + + // Set up the configuration object + val hadoopConf = description.serializableHadoopConf.value + hadoopConf.set("mapreduce.job.id", jobID.toString) + hadoopConf.set("mapreduce.task.id", taskAttemptId.getTaskID.toString) + hadoopConf.set("mapreduce.task.attempt.id", taskAttemptId.toString) + hadoopConf.setBoolean("mapreduce.task.ismap", true) + hadoopConf.setInt("mapreduce.task.partition", 0) + + (new TaskAttemptContextImpl(hadoopConf, taskAttemptId), jobID.toString) + } +} + +object CreateFileNameSpec { + def apply(taskContext: TaskAttemptContext, description: WriteJobDescription): FileNameSpec = { + val fileCounter = 0 + val suffix = f".c$fileCounter%03d" + + description.outputWriterFactory.getFileExtension(taskContext) + FileNameSpec("", suffix) + } +} + +object CreateBasicWriteTaskStats { + def apply( + numFiles: Int, + updatedPartitions: Set[String], + numWrittenRows: Long): BasicWriteTaskStats = { + val partitionsInternalRows = updatedPartitions.map { + part => + val parts = new Array[Any](1) + parts(0) = part + new GenericInternalRow(parts) + }.toSeq + BasicWriteTaskStats( + partitions = partitionsInternalRows, + numFiles = numFiles, + numBytes = 101, + numRows = numWrittenRows) + } +} + +/** [[HadoopMapReduceAdapter]] for [[HadoopMapReduceCommitProtocol]]. 
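+ * It uses reflection to reach HadoopMapReduceCommitProtocol's private `committer` field and + * `getFilename` method, so the native write path can reuse the committer's staging directory + * (the FileOutputCommitter work path when available) and Spark's task output file naming.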
*/ +case class HadoopMapReduceAdapter(sparkCommitter: HadoopMapReduceCommitProtocol) { + private lazy val committer: OutputCommitter = { + val field: Field = classOf[HadoopMapReduceCommitProtocol].getDeclaredField("committer") + field.setAccessible(true) + field.get(sparkCommitter).asInstanceOf[OutputCommitter] + } + private lazy val GetFilename = { + val m = classOf[HadoopMapReduceCommitProtocol] + .getDeclaredMethod("getFilename", classOf[TaskAttemptContext], classOf[FileNameSpec]) + m.setAccessible(true) + m + } + + private def newTaskAttemptTempPath(defaultPath: String): String = { + assert(committer != null) + val stagingDir: Path = committer match { + // For FileOutputCommitter it has its own staging path called "work path". + case f: FileOutputCommitter => + new Path(Option(f.getWorkPath).map(_.toString).getOrElse(defaultPath)) + case _ => + new Path(defaultPath) + } + stagingDir.toString + } + + private def getFilename(taskContext: TaskAttemptContext, spec: FileNameSpec): String = { + GetFilename.invoke(sparkCommitter, taskContext, spec).asInstanceOf[String] + } + + def getTaskAttemptTempPathAndFilename( + taskContext: TaskAttemptContext, + description: WriteJobDescription): (String, String) = { + val stageDir = newTaskAttemptTempPath(description.path) + val filename = getFilename(taskContext, CreateFileNameSpec(taskContext, description)) + (stageDir, filename) + } +} + +case class HadoopMapReduceCommitProtocolWrite( + override val jobTrackerID: String, + override val description: WriteJobDescription, + override val committer: HadoopMapReduceCommitProtocol) + extends CHColumnarWrite[HadoopMapReduceCommitProtocol] + with Logging { + + private lazy val adapter: HadoopMapReduceAdapter = HadoopMapReduceAdapter(committer) + + /** + * This function is used in [[CHColumnarWriteFilesRDD]] to inject the staging write path before + * initializing the native plan and collect native write files metrics for each backend. 
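+   * The injected temp path and file name come from [[HadoopMapReduceAdapter]], so the files the +   * native engine writes land where the Spark committer expects to find them at commit time.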
+ */ + override def doSetupNativeTask(): Unit = { + val (writePath, writeFileName) = + adapter.getTaskAttemptTempPathAndFilename(taskAttemptContext, description) + logDebug(s"Native staging write path: $writePath and file name: $writeFileName") + BackendsApiManager.getIteratorApiInstance.injectWriteFilesTempPath(writePath, writeFileName) + } + + def doCollectNativeResult(cb: ColumnarBatch): Option[WriteTaskResult] = { + val numFiles = cb.numRows() + // Write an empty iterator + if (numFiles == 0) { + None + } else { + val file_col = cb.column(0) + val partition_col = cb.column(1) + val count_col = cb.column(2) + + val outputPath = description.path + val partitions: mutable.Set[String] = mutable.Set[String]() + val addedAbsPathFiles: mutable.Map[String, String] = mutable.Map[String, String]() + + var numWrittenRows: Long = 0 + Range(0, cb.numRows()).foreach { + i => + val targetFileName = file_col.getUTF8String(i).toString + val partition = partition_col.getUTF8String(i).toString + if (partition != "__NO_PARTITION_ID__") { + partitions += partition + val tmpOutputPath = outputPath + "/" + partition + "/" + targetFileName + val customOutputPath = + description.customPartitionLocations.get( + PartitioningUtils.parsePathFragment(partition)) + if (customOutputPath.isDefined) { + addedAbsPathFiles(tmpOutputPath) = customOutputPath.get + "/" + targetFileName + } + } + numWrittenRows += count_col.getLong(i) + } + + val updatedPartitions = partitions.toSet + val summary = + ExecutedWriteSummary( + updatedPartitions = updatedPartitions, + stats = Seq(CreateBasicWriteTaskStats(numFiles, updatedPartitions, numWrittenRows))) + Some( + WriteTaskResult( + new TaskCommitMessage(addedAbsPathFiles.toMap -> updatedPartitions), + summary)) + } + } + + override def commitTask(batch: ColumnarBatch): Option[WriteTaskResult] = { + doCollectNativeResult(batch).map( + nativeWriteTaskResult => { + val (_, taskCommitTime) = Utils.timeTakenMs { + committer.commitTask(taskAttemptContext) + } + + // Just for update task commit time + description.statsTrackers.foreach { + stats => stats.newTaskInstance().getFinalStats(taskCommitTime) + } + nativeWriteTaskResult + }) + } +} + +object CHColumnarWrite { + def apply( + jobTrackerID: String, + description: WriteJobDescription, + committer: FileCommitProtocol): CHColumnarWrite[FileCommitProtocol] = committer match { + case h: HadoopMapReduceCommitProtocol => + HadoopMapReduceCommitProtocolWrite(jobTrackerID, description, h) + .asInstanceOf[CHColumnarWrite[FileCommitProtocol]] + case other => CHDeltaColumnarWrite(jobTrackerID, description, other) + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWriteFilesExec.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWriteFilesExec.scala new file mode 100644 index 0000000000000..bf051671fbba2 --- /dev/null +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWriteFilesExec.scala @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import org.apache.spark.{Partition, SparkException, TaskContext, TaskOutputFileAlreadyExistException} +import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils} +import org.apache.spark.rdd.RDD +import org.apache.spark.shuffle.FetchFailedException +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.connector.write.WriterCommitMessage +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.Utils + +import org.apache.hadoop.fs.FileAlreadyExistsException +import org.apache.hadoop.mapreduce.TaskAttemptContext + +import java.util.Date + +/** + * This RDD is used to make sure we have injected staging write path before initializing the native + * plan, and support Spark file commit protocol. + */ +class CHColumnarWriteFilesRDD( + var prev: RDD[ColumnarBatch], + description: WriteJobDescription, + committer: FileCommitProtocol, + jobTrackerID: String) + extends RDD[WriterCommitMessage](prev) { + + private def reportTaskMetrics(writeTaskResult: WriteTaskResult): Unit = { + val stats = writeTaskResult.summary.stats.head.asInstanceOf[BasicWriteTaskStats] + val (numBytes, numWrittenRows) = (stats.numBytes, stats.numRows) + // Reports bytesWritten and recordsWritten to the Spark output metrics. + // We should update it after calling `commitTask` to overwrite the metrics. + Option(TaskContext.get()).map(_.taskMetrics().outputMetrics).foreach { + outputMetrics => + outputMetrics.setBytesWritten(numBytes) + outputMetrics.setRecordsWritten(numWrittenRows) + } + } + + private def writeFilesForEmptyIterator( + taskAttemptContext: TaskAttemptContext, + sparkPartitionId: Int + ): WriteTaskResult = { + + val dataWriter = + if (sparkPartitionId != 0) { + // In case of empty job, leave first partition to save meta for file format like parquet. 
+ new EmptyDirectoryDataWriter(description, taskAttemptContext, committer) + } else if (description.partitionColumns.isEmpty) { + new SingleDirectoryDataWriter(description, taskAttemptContext, committer) + } else { + new DynamicPartitionDataSingleWriter(description, taskAttemptContext, committer) + } + + // We have done `setupTask` outside + dataWriter.writeWithIterator(Iterator.empty) + dataWriter.commit() + } + + override def compute(split: Partition, context: TaskContext): Iterator[WriterCommitMessage] = { + + val commitProtocol = CHColumnarWrite(jobTrackerID, description, committer) + commitProtocol.setupTask() + + try { + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { + + // Initialize the native plan + val iter = firstParent[ColumnarBatch].iterator(split, context) + assert(iter.hasNext) + val resultColumnarBatch = iter.next() + assert(resultColumnarBatch != null) + val writeTaskResult = commitProtocol + .commitTask(resultColumnarBatch) + .orElse({ + // If we are writing an empty iterator, then gluten backend would do nothing. + // Here we fallback to use vanilla Spark write files to generate an empty file for + // metadata only. + Some(writeFilesForEmptyIterator(commitProtocol.taskAttemptContext, context.partitionId)) + // We have done commit task inside `writeFilesForEmptyIterator`. + }) + .get + reportTaskMetrics(writeTaskResult) + Iterator.single(writeTaskResult) + })( + catchBlock = { + // If there is an error, abort the task + commitProtocol.abortTask() + logError(s"Job ${commitProtocol.jobId} aborted.") + } + ) + } catch { + case e: FetchFailedException => + throw e + case f: FileAlreadyExistsException if SQLConf.get.fastFailFileFormatOutput => + throw new TaskOutputFileAlreadyExistException(f) + case t: Throwable => + throw new SparkException( + s"Task failed while writing rows to output path: ${description.path}", + t) + } + } + + override protected def getPartitions: Array[Partition] = firstParent[ColumnarBatch].partitions + + override def clearDependencies(): Unit = { + super.clearDependencies() + prev = null + } +} + +case class CHColumnarWriteFilesExec( + override val left: SparkPlan, + override val right: SparkPlan, + fileFormat: FileFormat, + partitionColumns: Seq[Attribute], + bucketSpec: Option[BucketSpec], + options: Map[String, String], + staticPartitions: TablePartitionSpec +) extends ColumnarWriteFilesExec(left, right) { + + override protected def withNewChildrenInternal( + newLeft: SparkPlan, + newRight: SparkPlan): SparkPlan = + copy(newLeft, newRight, fileFormat, partitionColumns, bucketSpec, options, staticPartitions) + + override def doExecuteWrite(writeFilesSpec: WriteFilesSpec): RDD[WriterCommitMessage] = { + assert(child.supportsColumnar) + + val rdd = child.executeColumnar() + val jobTrackerID = SparkHadoopWriterUtils.createJobTrackerID(new Date()) + val description = writeFilesSpec.description + val committer = writeFilesSpec.committer + if (rdd.partitions.length == 0) { + // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single + // partition rdd to make sure we at least set up one write task to write the metadata. 
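On the empty-iterator fallback used in compute() above: when the native plan produces no result batch, the task falls back to vanilla Spark writers so that format-level metadata (for example an empty Parquet footer) is still produced. The selection rule, restated as a compact sketch outside the RDD (same vanilla Spark writer classes as above; imports shown for self-containment):

import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.sql.execution.datasources._

// Sketch of the writer choice in writeFilesForEmptyIterator: only Spark partition 0
// writes real metadata; every other partition gets an empty-directory writer.
def chooseFallbackWriter(
    description: WriteJobDescription,
    taskAttemptContext: TaskAttemptContext,
    committer: FileCommitProtocol,
    sparkPartitionId: Int): FileFormatDataWriter =
  if (sparkPartitionId != 0) {
    new EmptyDirectoryDataWriter(description, taskAttemptContext, committer)
  } else if (description.partitionColumns.isEmpty) {
    new SingleDirectoryDataWriter(description, taskAttemptContext, committer)
  } else {
    new DynamicPartitionDataSingleWriter(description, taskAttemptContext, committer)
  }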
+ writeFilesForEmptyRDD(description, committer, jobTrackerID) + } else { + new CHColumnarWriteFilesRDD(rdd, description, committer, jobTrackerID) + } + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/ClickhouseBackendWrite.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/ClickhouseBackendWrite.scala deleted file mode 100644 index 225d9688c7df1..0000000000000 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/ClickhouseBackendWrite.scala +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.sql.execution - -import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow -import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.vectorized.ColumnarBatch - -import scala.collection.mutable - -case class ClickhouseBackendWrite(description: WriteJobDescription) - extends BackendWrite - with Logging { - - override def collectNativeWriteFilesMetrics(cb: ColumnarBatch): Option[WriteTaskResult] = { - val numFiles = cb.numRows() - // Write an empty iterator - if (numFiles == 0) { - None - } else { - val file_col = cb.column(0) - val partition_col = cb.column(1) - val count_col = cb.column(2) - - val outputPath = description.path - var updatedPartitions = Set.empty[String] - val addedAbsPathFiles: mutable.Map[String, String] = mutable.Map[String, String]() - - val write_stats = Range(0, cb.numRows()).map { - i => - val targetFileName = file_col.getUTF8String(i).toString - val partition = partition_col.getUTF8String(i).toString - if (partition != "__NO_PARTITION_ID__") { - updatedPartitions += partition - val tmpOutputPath = outputPath + "/" + partition + "/" + targetFileName - val customOutputPath = - description.customPartitionLocations.get( - PartitioningUtils.parsePathFragment(partition)) - if (customOutputPath.isDefined) { - addedAbsPathFiles(tmpOutputPath) = customOutputPath.get + "/" + targetFileName - } - } - count_col.getLong(i) - } - - val partitionsInternalRows = updatedPartitions.map { - part => - val parts = new Array[Any](1) - parts(0) = part - new GenericInternalRow(parts) - }.toSeq - - val numWrittenRows = write_stats.sum - val stats = BasicWriteTaskStats( - partitions = partitionsInternalRows, - numFiles = numFiles, - numBytes = 101, - numRows = numWrittenRows) - val summary = - ExecutedWriteSummary(updatedPartitions = updatedPartitions, stats = Seq(stats)) - - Some( - WriteTaskResult( - new TaskCommitMessage(addedAbsPathFiles.toMap -> updatedPartitions), - summary)) - } - } -} diff --git 
a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCHCacheDataCommand.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCHCacheDataCommand.scala index 1e6b024063b6d..bb3cb5acce37c 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCHCacheDataCommand.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCHCacheDataCommand.scala @@ -16,22 +16,20 @@ */ package org.apache.spark.sql.execution.commands -import org.apache.gluten.exception.GlutenException import org.apache.gluten.expression.ConverterUtils import org.apache.gluten.substrait.rel.ExtensionTableBuilder import org.apache.spark.affinity.CHAffinity import org.apache.spark.rpc.GlutenDriverEndpoint -import org.apache.spark.rpc.GlutenRpcMessages.{CacheLoadResult, GlutenMergeTreeCacheLoad} +import org.apache.spark.rpc.GlutenRpcMessages.{CacheJobInfo, GlutenMergeTreeCacheLoad} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, GreaterThanOrEqual, IsNotNull, Literal} import org.apache.spark.sql.delta._ import org.apache.spark.sql.execution.command.LeafRunnableCommand -import org.apache.spark.sql.execution.commands.GlutenCHCacheDataCommand.toExecutorId +import org.apache.spark.sql.execution.commands.GlutenCacheBase._ import org.apache.spark.sql.execution.datasources.v2.clickhouse.metadata.AddMergeTreeParts import org.apache.spark.sql.types.{BooleanType, StringType} -import org.apache.spark.util.ThreadUtils import org.apache.hadoop.fs.Path @@ -41,7 +39,6 @@ import java.util.{ArrayList => JList} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.concurrent.Future -import scala.concurrent.duration.Duration case class GlutenCHCacheDataCommand( onlyMetaCache: Boolean, @@ -106,7 +103,8 @@ case class GlutenCHCacheDataCommand( } val selectedAddFiles = if (tsfilter.isDefined) { - val allParts = DeltaAdapter.snapshotFilesForScan(snapshot, Seq.empty, Seq.empty, false) + val allParts = + DeltaAdapter.snapshotFilesForScan(snapshot, Seq.empty, Seq.empty, keepNumRecords = false) allParts.files.filter(_.modificationTime >= tsfilter.get.toLong).toSeq } else if (partitionColumn.isDefined && partitionValue.isDefined) { val partitionColumns = snapshot.metadata.partitionSchema.fieldNames @@ -126,18 +124,18 @@ case class GlutenCHCacheDataCommand( snapshot, Seq(partitionColumnAttr), Seq(isNotNullExpr, greaterThanOrEqual), - false) + keepNumRecords = false) .files } else { - DeltaAdapter.snapshotFilesForScan(snapshot, Seq.empty, Seq.empty, false).files + DeltaAdapter + .snapshotFilesForScan(snapshot, Seq.empty, Seq.empty, keepNumRecords = false) + .files } val executorIdsToAddFiles = scala.collection.mutable.Map[String, ArrayBuffer[AddMergeTreeParts]]() val executorIdsToParts = scala.collection.mutable.Map[String, String]() - executorIdsToAddFiles.put( - GlutenCHCacheDataCommand.ALL_EXECUTORS, - new ArrayBuffer[AddMergeTreeParts]()) + executorIdsToAddFiles.put(ALL_EXECUTORS, new ArrayBuffer[AddMergeTreeParts]()) selectedAddFiles.foreach( addFile => { val mergeTreePart = addFile.asInstanceOf[AddMergeTreeParts] @@ -151,9 +149,7 @@ case class GlutenCHCacheDataCommand( if (locations.isEmpty) { // non soft affinity - executorIdsToAddFiles - .get(GlutenCHCacheDataCommand.ALL_EXECUTORS) - .get + executorIdsToAddFiles(ALL_EXECUTORS) 
.append(mergeTreePart) } else { locations.foreach( @@ -161,7 +157,7 @@ case class GlutenCHCacheDataCommand( if (!executorIdsToAddFiles.contains(executor)) { executorIdsToAddFiles.put(executor, new ArrayBuffer[AddMergeTreeParts]()) } - executorIdsToAddFiles.get(executor).get.append(mergeTreePart) + executorIdsToAddFiles(executor).append(mergeTreePart) }) } }) @@ -201,87 +197,33 @@ case class GlutenCHCacheDataCommand( executorIdsToParts.put(executorId, extensionTableNode.getExtensionTableStr) } }) - - // send rpc call - if (executorIdsToParts.contains(GlutenCHCacheDataCommand.ALL_EXECUTORS)) { + val futureList = ArrayBuffer[(String, Future[CacheJobInfo])]() + if (executorIdsToParts.contains(ALL_EXECUTORS)) { // send all parts to all executors - val tableMessage = executorIdsToParts.get(GlutenCHCacheDataCommand.ALL_EXECUTORS).get - if (asynExecute) { - GlutenDriverEndpoint.executorDataMap.forEach( - (executorId, executor) => { - executor.executorEndpointRef.send( - GlutenMergeTreeCacheLoad(tableMessage, selectedColumns.toSet.asJava)) - }) - Seq(Row(true, "")) - } else { - val futureList = ArrayBuffer[Future[CacheLoadResult]]() - val resultList = ArrayBuffer[CacheLoadResult]() - GlutenDriverEndpoint.executorDataMap.forEach( - (executorId, executor) => { - futureList.append( - executor.executorEndpointRef.ask[CacheLoadResult]( + val tableMessage = executorIdsToParts(ALL_EXECUTORS) + GlutenDriverEndpoint.executorDataMap.forEach( + (executorId, executor) => { + futureList.append( + ( + executorId, + executor.executorEndpointRef.ask[CacheJobInfo]( GlutenMergeTreeCacheLoad(tableMessage, selectedColumns.toSet.asJava) - )) - }) - futureList.foreach( - f => { - resultList.append(ThreadUtils.awaitResult(f, Duration.Inf)) - }) - if (resultList.exists(!_.success)) { - Seq(Row(false, resultList.filter(!_.success).map(_.reason).mkString(";"))) - } else { - Seq(Row(true, "")) - } - } + ))) + }) } else { - if (asynExecute) { - executorIdsToParts.foreach( - value => { - val executorData = GlutenDriverEndpoint.executorDataMap.get(toExecutorId(value._1)) - if (executorData != null) { - executorData.executorEndpointRef.send( - GlutenMergeTreeCacheLoad(value._2, selectedColumns.toSet.asJava)) - } else { - throw new GlutenException( - s"executor ${value._1} not found," + - s" all executors are ${GlutenDriverEndpoint.executorDataMap.toString}") - } - }) - Seq(Row(true, "")) - } else { - val futureList = ArrayBuffer[Future[CacheLoadResult]]() - val resultList = ArrayBuffer[CacheLoadResult]() - executorIdsToParts.foreach( - value => { - val executorData = GlutenDriverEndpoint.executorDataMap.get(toExecutorId(value._1)) - if (executorData != null) { - futureList.append( - executorData.executorEndpointRef.ask[CacheLoadResult]( - GlutenMergeTreeCacheLoad(value._2, selectedColumns.toSet.asJava) - )) - } else { - throw new GlutenException( - s"executor ${value._1} not found," + - s" all executors are ${GlutenDriverEndpoint.executorDataMap.toString}") - } - }) - futureList.foreach( - f => { - resultList.append(ThreadUtils.awaitResult(f, Duration.Inf)) - }) - if (resultList.exists(!_.success)) { - Seq(Row(false, resultList.filter(!_.success).map(_.reason).mkString(";"))) - } else { - Seq(Row(true, "")) - } - } + executorIdsToParts.foreach( + value => { + checkExecutorId(value._1) + val executorData = GlutenDriverEndpoint.executorDataMap.get(toExecutorId(value._1)) + futureList.append( + ( + value._1, + executorData.executorEndpointRef.ask[CacheJobInfo]( + GlutenMergeTreeCacheLoad(value._2, selectedColumns.toSet.asJava) + 
))) + }) } - } -} - -object GlutenCHCacheDataCommand { - val ALL_EXECUTORS = "allExecutors" - private def toExecutorId(executorId: String): String = - executorId.split("_").last + getResult(futureList, asynExecute) + } } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCacheBase.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCacheBase.scala new file mode 100644 index 0000000000000..c4e9f51bce63e --- /dev/null +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCacheBase.scala @@ -0,0 +1,123 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.commands + +import org.apache.gluten.exception.GlutenException +import org.apache.gluten.execution.CacheResult +import org.apache.gluten.execution.CacheResult.Status + +import org.apache.spark.rpc.GlutenDriverEndpoint +import org.apache.spark.rpc.GlutenRpcMessages.{CacheJobInfo, GlutenCacheLoadStatus} +import org.apache.spark.sql.Row +import org.apache.spark.util.ThreadUtils + +import scala.collection.mutable.ArrayBuffer +import scala.concurrent.Future +import scala.concurrent.duration.Duration + +object GlutenCacheBase { + def ALL_EXECUTORS: String = "allExecutors" + + def toExecutorId(executorId: String): String = + executorId.split("_").last + + protected def waitRpcResults + : ArrayBuffer[(String, Future[CacheJobInfo])] => ArrayBuffer[(String, CacheJobInfo)] = + (futureList: ArrayBuffer[(String, Future[CacheJobInfo])]) => { + val resultList = ArrayBuffer[(String, CacheJobInfo)]() + futureList.foreach( + f => { + resultList.append((f._1, ThreadUtils.awaitResult(f._2, Duration.Inf))) + }) + resultList + } + + def checkExecutorId(executorId: String): Unit = { + if (!GlutenDriverEndpoint.executorDataMap.containsKey(toExecutorId(executorId))) { + throw new GlutenException( + s"executor $executorId not found," + + s" all executors are ${GlutenDriverEndpoint.executorDataMap.toString}") + } + } + + def waitAllJobFinish( + jobs: ArrayBuffer[(String, CacheJobInfo)], + ask: (String, String) => Future[CacheResult]): (Boolean, String) = { + val res = collectJobTriggerResult(jobs) + var status = res._1 + val messages = res._2 + jobs.foreach( + job => { + if (status) { + var complete = false + while (!complete) { + Thread.sleep(5000) + val future_result = ask(job._1, job._2.jobId) + val result = ThreadUtils.awaitResult(future_result, Duration.Inf) + result.getStatus match { + case Status.ERROR => + status = false + messages.append( + s"executor : {}, failed with message: {};", + job._1, + result.getMessage) + complete = true + case Status.SUCCESS => + complete = true + case _ => + // still running + } + } + } + }) + (status, messages.mkString(";")) + } + + 
def collectJobTriggerResult( + jobs: ArrayBuffer[(String, CacheJobInfo)]): (Boolean, ArrayBuffer[String]) = { + var status = true + val messages = ArrayBuffer[String]() + jobs.foreach( + job => { + if (!job._2.status) { + messages.append(job._2.reason) + status = false + } + }) + (status, messages) + } + + def getResult( + futureList: ArrayBuffer[(String, Future[CacheJobInfo])], + async: Boolean): Seq[Row] = { + val resultList = waitRpcResults(futureList) + if (async) { + val res = collectJobTriggerResult(resultList) + Seq(Row(res._1, res._2.mkString(";"))) + } else { + val fetchStatus: (String, String) => Future[CacheResult] = + (executorId: String, jobId: String) => { + GlutenDriverEndpoint.executorDataMap + .get(toExecutorId(executorId)) + .executorEndpointRef + .ask[CacheResult](GlutenCacheLoadStatus(jobId)) + } + val res = waitAllJobFinish(resultList, fetchStatus) + Seq(Row(res._1, res._2)) + } + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCacheFilesCommand.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCacheFilesCommand.scala new file mode 100644 index 0000000000000..0a08df7cebade --- /dev/null +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/commands/GlutenCacheFilesCommand.scala @@ -0,0 +1,188 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
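The refactor above centralizes the RPC bookkeeping that GlutenCHCacheDataCommand used to inline: a command now collects (executorId, Future[CacheJobInfo]) pairs and hands them to GlutenCacheBase.getResult, which either reports the trigger status (async) or polls each executor via GlutenCacheLoadStatus until SUCCESS/ERROR (sync). A hedged sketch of that usage from a command's point of view; the wrapper function itself is hypothetical, the message and endpoint types are taken from the surrounding diff:

import org.apache.spark.rpc.GlutenDriverEndpoint
import org.apache.spark.rpc.GlutenRpcMessages.{CacheJobInfo, GlutenMergeTreeCacheLoad}
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.commands.GlutenCacheBase._

import scala.collection.mutable.ArrayBuffer
import scala.concurrent.Future

// Sketch: trigger a cache load on every executor and collect the result rows.
def triggerCacheOnAllExecutors(
    tableMessage: String,
    columns: java.util.Set[String],
    async: Boolean): Seq[Row] = {
  val futureList = ArrayBuffer[(String, Future[CacheJobInfo])]()
  GlutenDriverEndpoint.executorDataMap.forEach(
    (executorId, executor) => {
      futureList.append(
        (
          executorId,
          executor.executorEndpointRef.ask[CacheJobInfo](
            GlutenMergeTreeCacheLoad(tableMessage, columns))))
    })
  // getResult returns Seq(Row(success, message)); in sync mode it also polls job status.
  getResult(futureList, async)
}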
+ */ +package org.apache.spark.sql.execution.commands + +import org.apache.gluten.substrait.rel.LocalFilesBuilder +import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat + +import org.apache.spark.affinity.CHAffinity +import org.apache.spark.rpc.GlutenDriverEndpoint +import org.apache.spark.rpc.GlutenRpcMessages.{CacheJobInfo, GlutenFilesCacheLoad} +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} +import org.apache.spark.sql.execution.command.LeafRunnableCommand +import org.apache.spark.sql.execution.commands.GlutenCacheBase._ +import org.apache.spark.sql.types.{BooleanType, StringType} + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} + +import java.io.FileNotFoundException +import java.lang.{Long => JLong} +import java.util.{ArrayList => JArrayList, HashMap => JHashMap, Map => JMap} + +import scala.collection.mutable.ArrayBuffer +import scala.concurrent.Future + +case class GlutenCacheFilesCommand( + async: Boolean, + selectedColumn: Option[Seq[String]], + filePath: String, + propertyOverrides: Map[String, String] +) extends LeafRunnableCommand { + + override def output: Seq[Attribute] = Seq( + AttributeReference("result", BooleanType, nullable = false)(), + AttributeReference("reason", StringType, nullable = false)()) + + override def run(session: SparkSession): Seq[Row] = { + val targetFile = new Path(filePath) + val hadoopConf: Configuration = session.sparkContext.hadoopConfiguration + val fs = targetFile.getFileSystem(hadoopConf) + if (!fs.exists(targetFile)) { + throw new FileNotFoundException(filePath) + } + + val recursive = + if ("true".equalsIgnoreCase(propertyOverrides.getOrElse("recursive", "false"))) { + true + } else { + false + } + + val files: Seq[FileStatus] = listFiles(targetFile, recursive, fs) + val executorIdsToFiles = + scala.collection.mutable.Map[String, ArrayBuffer[FileStatus]]() + executorIdsToFiles.put(ALL_EXECUTORS, new ArrayBuffer[FileStatus]()) + + files.foreach( + fileStatus => { + val locations = CHAffinity.getHostLocations(fileStatus.getPath.toUri.toASCIIString) + if (locations.isEmpty) { + executorIdsToFiles(ALL_EXECUTORS).append(fileStatus) + } else { + locations.foreach( + executor => { + if (!executorIdsToFiles.contains(executor)) { + executorIdsToFiles.put(executor, new ArrayBuffer[FileStatus]()) + } + executorIdsToFiles(executor).append(fileStatus) + }) + } + }) + + val executorIdsToLocalFiles = executorIdsToFiles + .filter(_._2.nonEmpty) + .map { + case (executorId, fileStatusArray) => + val paths = new JArrayList[String]() + val starts = new JArrayList[JLong]() + val lengths = new JArrayList[JLong]() + val partitionColumns = new JArrayList[JMap[String, String]] + + fileStatusArray.foreach( + fileStatus => { + paths.add(fileStatus.getPath.toUri.toASCIIString) + starts.add(JLong.valueOf(0)) + lengths.add(JLong.valueOf(fileStatus.getLen)) + partitionColumns.add(new JHashMap[String, String]()) + }) + + val localFile = LocalFilesBuilder.makeLocalFiles( + null, + paths, + starts, + lengths, + lengths, + new JArrayList[JLong](), + partitionColumns, + new JArrayList[JMap[String, String]](), + ReadFileFormat.ParquetReadFormat, // ignore format in backend + new JArrayList[String](), + new JHashMap[String, String]() + ) + + (executorId, localFile) + } + .toMap + + val futureList = ArrayBuffer[(String, Future[CacheJobInfo])]() + val fileNodeOption = executorIdsToLocalFiles.get(ALL_EXECUTORS) + if 
(fileNodeOption.isDefined) { + GlutenDriverEndpoint.executorDataMap.forEach( + (executorId, executor) => { + futureList.append( + ( + executorId, + executor.executorEndpointRef.ask[CacheJobInfo]( + GlutenFilesCacheLoad(fileNodeOption.get.toProtobuf.toByteArray)))) + }) + } else { + executorIdsToLocalFiles.foreach { + case (executorId, fileNode) => + checkExecutorId(executorId) + val executor = GlutenDriverEndpoint.executorDataMap.get(toExecutorId(executorId)) + futureList.append( + ( + executorId, + executor.executorEndpointRef.ask[CacheJobInfo]( + GlutenFilesCacheLoad(fileNode.toProtobuf.toByteArray)))) + } + } + + getResult(futureList, async) + } + + private def listFiles(targetFile: Path, recursive: Boolean, fs: FileSystem): Seq[FileStatus] = { + val dirContents = fs + .listStatus(targetFile) + .flatMap(f => addInputPathRecursively(fs, f, recursive)) + .filter(isNonEmptyDataFile) + .toSeq + dirContents + } + + private def addInputPathRecursively( + fs: FileSystem, + files: FileStatus, + recursive: Boolean): Seq[FileStatus] = { + if (files.isFile) { + Seq(files) + } else if (recursive) { + fs.listStatus(files.getPath) + .flatMap( + file => { + if (file.isFile) { + Seq(file) + } else { + addInputPathRecursively(fs, file, recursive) + } + }) + } else { + Seq() + } + } + + private def isNonEmptyDataFile(f: FileStatus): Boolean = { + if (!f.isFile || f.getLen == 0) { + false + } else { + val name = f.getPath.getName + !((name.startsWith("_") && !name.contains("=")) || name.startsWith(".")) + } + } +} diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala index 17eb0ed0b037b..4496d893fcd7c 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/utils/CHExecUtil.scala @@ -59,14 +59,16 @@ object CHExecUtil extends Logging { dataSize: SQLMetric, iter: Iterator[ColumnarBatch], compressionCodec: Option[String] = Some("lz4"), + compressionLevel: Option[Int] = None, bufferSize: Int = 4 << 10): Iterator[(Int, Array[Byte])] = { var count = 0 val bos = new ByteArrayOutputStream() val buffer = new Array[Byte](bufferSize) // 4K + val level = compressionLevel.getOrElse(Int.MinValue) val blockOutputStream = compressionCodec - .map(new BlockOutputStream(bos, buffer, dataSize, true, _, bufferSize)) - .getOrElse(new BlockOutputStream(bos, buffer, dataSize, false, "", bufferSize)) + .map(new BlockOutputStream(bos, buffer, dataSize, true, _, level, bufferSize)) + .getOrElse(new BlockOutputStream(bos, buffer, dataSize, false, "", level, bufferSize)) while (iter.hasNext) { val batch = iter.next() count += batch.numRows @@ -127,7 +129,7 @@ object CHExecUtil extends Logging { result } - override def next: UnsafeRow = { + override def next(): UnsafeRow = { if (rowId >= rows) throw new NoSuchElementException val (offset, length) = (rowInfo.offsets(rowId), rowInfo.lengths(rowId)) @@ -317,7 +319,7 @@ object CHExecUtil extends Logging { // Thus in Columnar Shuffle we never use the "key" part. 
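On the CHExecUtil change above: the new compressionLevel parameter is optional, and an absent value is forwarded to BlockOutputStream as Int.MinValue. I read that sentinel as "let the codec pick its default level"; that interpretation is an assumption, not stated in the patch. A minimal sketch of the defaulting convention:

// Sketch of the level-defaulting convention used when building the block output stream.
def effectiveCompressionLevel(compressionLevel: Option[Int]): Int =
  compressionLevel.getOrElse(Int.MinValue)

assert(effectiveCompressionLevel(None) == Int.MinValue) // no explicit level requested
assert(effectiveCompressionLevel(Some(3)) == 3)         // caller-specified level passed through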
val isOrderSensitive = isRoundRobin && !SQLConf.get.sortBeforeRepartition - val rddWithpartitionKey: RDD[Product2[Int, ColumnarBatch]] = + val rddWithPartitionKey: RDD[Product2[Int, ColumnarBatch]] = if ( GlutenConfig.getConf.isUseColumnarShuffleManager || GlutenConfig.getConf.isUseCelebornShuffleManager @@ -343,7 +345,7 @@ object CHExecUtil extends Logging { val dependency = new ColumnarShuffleDependency[Int, ColumnarBatch, ColumnarBatch]( - rddWithpartitionKey, + rddWithPartitionKey, new PartitionIdPassthrough(newPartitioning.numPartitions), serializer, shuffleWriterProcessor = ShuffleExchangeExec.createShuffleWriteProcessor(writeMetrics), diff --git a/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc b/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc new file mode 100644 index 0000000000000..ab1b785dbbfc2 Binary files /dev/null and b/backends-clickhouse/src/test/resources/orc-data/test_reader_time_zone.snappy.orc differ diff --git a/backends-clickhouse/src/test/resources/text-data/json-settings/data.txt b/backends-clickhouse/src/test/resources/text-data/json-settings/data.txt index 230b46ec38038..0541ce3469a71 100644 --- a/backends-clickhouse/src/test/resources/text-data/json-settings/data.txt +++ b/backends-clickhouse/src/test/resources/text-data/json-settings/data.txt @@ -1,5 +1,4 @@ {"a":1,"b":2,"c":3} -{"a":"a5", "B":"b6", "c":7} {"a":"4"} {"t":{"ta":"cc","tb":100,"tc":1.234}} {"t":{"ta":"cc","tb":100,"td":"ignore"}} diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarShuffleAQESuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarShuffleAQESuite.scala index f25b8643b7070..10e5c7534d352 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarShuffleAQESuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseColumnarShuffleAQESuite.scala @@ -17,16 +17,22 @@ package org.apache.gluten.execution import org.apache.spark.SparkConf +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.optimizer._ +import org.apache.spark.sql.catalyst.plans._ +import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.CoalescedPartitionSpec import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper, AQEShuffleReadExec} class GlutenClickHouseColumnarShuffleAQESuite extends GlutenClickHouseTPCHAbstractSuite - with AdaptiveSparkPlanHelper { + with AdaptiveSparkPlanHelper + with Logging { override protected val tablesPath: String = basePath + "/tpch-data-ch" override protected val tpchQueries: String = rootPath + "queries/tpch-queries-ch" override protected val queriesResults: String = rootPath + "mergetree-queries-output" + private val backendConfigPrefix = "spark.gluten.sql.columnar.backend.ch." 
/** Run Gluten + ClickHouse Backend with ColumnarShuffleManager */ override protected def sparkConf: SparkConf = { @@ -171,4 +177,133 @@ class GlutenClickHouseColumnarShuffleAQESuite assert(adaptiveSparkPlanExec(1) == adaptiveSparkPlanExec(2)) } } + + test("GLUTEN-6768 rerorder hash join") { + withSQLConf( + ("spark.gluten.sql.columnar.backend.ch.enable_reorder_hash_join_tables", "true"), + ("spark.sql.adaptive.enabled", "true")) { + spark.sql("create table t1(a int, b int) using parquet") + spark.sql("create table t2(a int, b int) using parquet") + + spark.sql("insert into t1 select id as a, id as b from range(100000)") + spark.sql("insert into t1 select id as a, id as b from range(100)") + + def isExpectedJoinNode(plan: SparkPlan, joinType: JoinType, buildSide: BuildSide): Boolean = { + plan match { + case join: CHShuffledHashJoinExecTransformer => + join.joinType == joinType && join.buildSide == buildSide + case _ => false + } + } + + def collectExpectedJoinNode( + plan: SparkPlan, + joinType: JoinType, + buildSide: BuildSide): Seq[SparkPlan] = { + if (isExpectedJoinNode(plan, joinType, buildSide)) { + Seq(plan) ++ plan.children.flatMap(collectExpectedJoinNode(_, joinType, buildSide)) + } else { + plan.children.flatMap(collectExpectedJoinNode(_, joinType, buildSide)) + } + } + + var sql = """ + |select * from t2 left join t1 on t1.a = t2.a + |""".stripMargin + compareResultsAgainstVanillaSpark( + sql, + true, + { + df => + val joins = df.queryExecution.executedPlan.collect { + case adpativeNode: AdaptiveSparkPlanExec => + collectExpectedJoinNode(adpativeNode.executedPlan, RightOuter, BuildRight) + case _ => Seq() + } + assert(joins.size == 1) + } + ) + + sql = """ + |select * from t2 right join t1 on t1.a = t2.a + |""".stripMargin + compareResultsAgainstVanillaSpark( + sql, + true, + { + df => + val joins = df.queryExecution.executedPlan.collect { + case adpativeNode: AdaptiveSparkPlanExec => + collectExpectedJoinNode(adpativeNode.executedPlan, LeftOuter, BuildRight) + case _ => Seq() + } + assert(joins.size == 1) + } + ) + + sql = """ + |select * from t1 right join t2 on t1.a = t2.a + |""".stripMargin + compareResultsAgainstVanillaSpark( + sql, + true, + { + df => + val joins = df.queryExecution.executedPlan.collect { + case adpativeNode: AdaptiveSparkPlanExec => + collectExpectedJoinNode(adpativeNode.executedPlan, RightOuter, BuildRight) + case _ => Seq() + } + assert(joins.size == 1) + } + ) + + spark.sql("drop table t1") + spark.sql("drop table t2") + } + } + + test("GLUTEN-6768 change mixed join condition into multi join on clauses") { + withSQLConf( + (backendConfigPrefix + "runtime_config.prefer_multi_join_on_clauses", "true"), + (backendConfigPrefix + "runtime_config.multi_join_on_clauses_build_side_row_limit", "1000000") + ) { + + spark.sql("create table t1(a int, b int, c int, d int) using parquet") + spark.sql("create table t2(a int, b int, c int, d int) using parquet") + + spark.sql(""" + |insert into t1 + |select id % 2 as a, id as b, id + 1 as c, id + 2 as d from range(1000) + |""".stripMargin) + spark.sql(""" + |insert into t2 + |select id % 2 as a, id as b, id + 1 as c, id + 2 as d from range(1000) + |""".stripMargin) + + var sql = """ + |select * from t1 join t2 on + |t1.a = t2.a and (t1.b = t2.b or t1.c = t2.c or t1.d = t2.d) + |order by t1.a, t1.b, t1.c, t1.d + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + sql = """ + |select * from t1 join t2 on + |t1.a = t2.a and (t1.b = t2.b or t1.c = t2.c or (t1.c = t2.c and t1.d = t2.d)) + |order 
by t1.a, t1.b, t1.c, t1.d + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + sql = """ + |select * from t1 join t2 on + |t1.a = t2.a and (t1.b = t2.b or t1.c = t2.c or (t1.d = t2.d and t1.c >= t2.c)) + |order by t1.a, t1.b, t1.c, t1.d + |""".stripMargin + compareResultsAgainstVanillaSpark(sql, true, { _ => }) + + spark.sql("drop table t1") + spark.sql("drop table t2") + } + } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala index 8f8351baeae10..d6f9a0162216f 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala @@ -1251,7 +1251,7 @@ class GlutenClickHouseDeltaParquetWriteSuite runTPCHQueryBySQL(1, sqlStr) { _ => {} } } - test("test parquet optimize basic") { + testSparkVersionLE33("test parquet optimize basic") { withSQLConf("spark.databricks.delta.optimize.maxFileSize" -> "20000000") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_delta_parquet_optimize; @@ -1286,7 +1286,7 @@ class GlutenClickHouseDeltaParquetWriteSuite } } - test("test parquet optimize partitioned by one low card column") { + testSparkVersionLE33("test parquet optimize partitioned by one low card column") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_delta_parquet_optimize_p2; |""".stripMargin) @@ -1325,7 +1325,7 @@ class GlutenClickHouseDeltaParquetWriteSuite assert(ret2.apply(0).get(0) == 600572) } - test("test parquet optimize parallel delete") { + testSparkVersionLE33("test parquet optimize parallel delete") { withSQLConf("spark.databricks.delta.vacuum.parallelDelete.enabled" -> "true") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_delta_parquet_optimize_p4; @@ -1356,7 +1356,7 @@ class GlutenClickHouseDeltaParquetWriteSuite } } - test("test parquet optimize with the path based table") { + testSparkVersionLE33("test parquet optimize with the path based table") { val dataPath = s"$basePath/lineitem_delta_parquet_optimize_path_based" clearDataPath(dataPath) withSQLConf( @@ -1372,14 +1372,16 @@ class GlutenClickHouseDeltaParquetWriteSuite .mode(SaveMode.Append) .save(dataPath) + assert(countFiles(new File(dataPath)) === 51) + val clickhouseTable = DeltaTable.forPath(spark, dataPath) clickhouseTable.optimize().executeCompaction() clickhouseTable.vacuum(0.0) if (sparkVersion.equals("3.2")) { - assert(countFiles(new File(dataPath)) == 27) + assert(countFiles(new File(dataPath)) === 27) } else { - assert(countFiles(new File(dataPath)) == 29) + assert(countFiles(new File(dataPath)) === 29) } val ret = spark.sql(s"select count(*) from clickhouse.`$dataPath`").collect() diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala new file mode 100644 index 0000000000000..75c4372a04d9d --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseJoinSuite.scala @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
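The GLUTEN-6768 test above exercises two ClickHouse runtime settings, runtime_config.prefer_multi_join_on_clauses and runtime_config.multi_join_on_clauses_build_side_row_limit, both addressed through the spark.gluten.sql.columnar.backend.ch. prefix. A hedged sketch of enabling them on a plain session outside the test harness; spark, t1, and t2 are placeholders, and the rewrite's effect (splitting a mixed condition into multiple join-on clauses) is described by the test name rather than verified here:

// Sketch: toggling the mixed-join-condition rewrite on a regular SparkSession.
val chPrefix = "spark.gluten.sql.columnar.backend.ch."
spark.conf.set(chPrefix + "runtime_config.prefer_multi_join_on_clauses", "true")
spark.conf.set(chPrefix + "runtime_config.multi_join_on_clauses_build_side_row_limit", "1000000")

spark.sql(
  """
    |select * from t1 join t2
    |  on t1.a = t2.a and (t1.b = t2.b or t1.c = t2.c)
    |""".stripMargin).show()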
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution + +import org.apache.gluten.GlutenConfig +import org.apache.gluten.utils.UTSystemParameters + +import org.apache.spark.SparkConf + +class GlutenClickHouseJoinSuite extends GlutenClickHouseWholeStageTransformerSuite { + + protected val tablesPath: String = basePath + "/tpch-data" + protected val tpchQueries: String = + rootPath + "../../../../gluten-core/src/test/resources/tpch-queries" + protected val queriesResults: String = rootPath + "queries-output" + + private val joinAlgorithm = "spark.gluten.sql.columnar.backend.ch.runtime_settings.join_algorithm" + + override protected def sparkConf: SparkConf = { + super.sparkConf + .set("spark.sql.files.maxPartitionBytes", "1g") + .set("spark.serializer", "org.apache.spark.serializer.JavaSerializer") + .set("spark.sql.shuffle.partitions", "5") + .set("spark.sql.adaptive.enabled", "false") + .set("spark.sql.files.minPartitionNum", "1") + .set("spark.gluten.sql.columnar.columnartorow", "true") + .set("spark.gluten.sql.columnar.backend.ch.worker.id", "1") + .set(GlutenConfig.GLUTEN_LIB_PATH, UTSystemParameters.clickHouseLibPath) + .set("spark.gluten.sql.columnar.iterator", "true") + .set("spark.gluten.sql.columnar.hashagg.enablefinal", "true") + .set("spark.gluten.sql.enable.native.validation", "false") + .set("spark.sql.warehouse.dir", warehouse) + .set( + "spark.sql.warehouse.dir", + getClass.getResource("/").getPath + "tests-working-home/spark-warehouse") + .set("spark.hive.exec.dynamic.partition.mode", "nonstrict") + .set("spark.shuffle.manager", "sort") + .set("spark.io.compression.codec", "snappy") + .set("spark.sql.shuffle.partitions", "5") + .set("spark.sql.autoBroadcastJoinThreshold", "10MB") + .set(joinAlgorithm, "hash") + .set("spark.sql.autoBroadcastJoinThreshold", "-1") + .setMaster("local[*]") + } + + test("int to long join key rewrite causes column miss match ") { + assert("hash".equalsIgnoreCase(sparkConf.get(joinAlgorithm, "hash"))) + withSQLConf(joinAlgorithm -> "grace_hash") { + withTable("my_customer", "my_store_sales", "my_date_dim") { + sql(""" + |CREATE TABLE my_customer ( + | c_customer_sk INT) + |USING orc + |""".stripMargin) + sql(""" + |CREATE TABLE my_store_sales ( + | ss_sold_date_sk INT, + | ss_customer_sk INT) + | USING orc + |""".stripMargin) + sql(""" + |CREATE TABLE my_date_dim ( + | d_date_sk INT, + | d_year INT, + | d_qoy INT) + |USING orc + |""".stripMargin) + + sql("insert into my_customer values (1), (2), (3), (4)") + sql("insert into my_store_sales values (1, 1), (2, 2), (3, 3), (4, 4)") + sql("insert into my_date_dim values (1, 2002, 1), (2, 2002, 2)") + val q = + """ + |SELECT + | count(*) cnt1 + |FROM + | my_customer c + |WHERE + | exists(SELECT * + | FROM my_store_sales, my_date_dim + | WHERE c.c_customer_sk = ss_customer_sk AND + | ss_sold_date_sk = d_date_sk AND + | d_year = 2002 AND + | d_qoy < 4) + |LIMIT 100 + |""".stripMargin + 
runQueryAndCompare(q)(checkGlutenOperatorMatch[CHShuffledHashJoinExecTransformer]) + } + } + } + +} diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnS3Suite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnS3Suite.scala index c95b788583229..87e95cbe9dda6 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnS3Suite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseMergeTreeWriteOnS3Suite.scala @@ -188,20 +188,33 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite var metadataGlutenExist: Boolean = false var metadataBinExist: Boolean = false var dataBinExist: Boolean = false + var hasCommits = false client .listObjects(args) .forEach( obj => { objectCount += 1 - if (obj.get().objectName().contains("metadata.gluten")) { + val objectName = obj.get().objectName() + if (objectName.contains("metadata.gluten")) { metadataGlutenExist = true - } else if (obj.get().objectName().contains("meta.bin")) { + } else if (objectName.contains("meta.bin")) { metadataBinExist = true - } else if (obj.get().objectName().contains("data.bin")) { + } else if (objectName.contains("data.bin")) { dataBinExist = true + } else if (objectName.contains("_commits")) { + // Spark 35 has _commits directory + // table/_delta_log/_commits/ + hasCommits = true } }) - assertResult(5)(objectCount) + + if (isSparkVersionGE("3.5")) { + assertResult(6)(objectCount) + assert(hasCommits) + } else { + assertResult(5)(objectCount) + } + assert(metadataGlutenExist) assert(metadataBinExist) assert(dataBinExist) @@ -764,5 +777,48 @@ class GlutenClickHouseMergeTreeWriteOnS3Suite } } } + + test("GLUTEN-6750: Optimize error if file metadata not exist") { + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_mergetree_bucket_s3; + |""".stripMargin) + + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_mergetree_bucket_s3 + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING clickhouse + |PARTITIONED BY (l_returnflag) + |CLUSTERED BY (l_orderkey) + |${if (sparkVersion.equals("3.2")) "" else "SORTED BY (l_partkey)"} INTO 4 BUCKETS + |LOCATION 's3a://$BUCKET_NAME/lineitem_mergetree_bucket_s3' + |TBLPROPERTIES (storage_policy='__s3_main') + |""".stripMargin) + + spark.sql(s""" + | insert into table lineitem_mergetree_bucket_s3 + | select /*+ REPARTITION(3) */ * from lineitem + |""".stripMargin) + + FileUtils.deleteDirectory(new File(S3_METADATA_PATH)) + spark.sql("optimize lineitem_mergetree_bucket_s3") + spark.sql("drop table lineitem_mergetree_bucket_s3") + } } // scalastyle:off line.size.limit diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseWholeStageTransformerSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseWholeStageTransformerSuite.scala index 4972861152fde..dfc5fbd3b37e0 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseWholeStageTransformerSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseWholeStageTransformerSuite.scala @@ -41,6 
+41,9 @@ class GlutenClickHouseWholeStageTransformerSuite extends WholeStageTransformerSu version(0) + "." + version(1) } + val CH_CONFIG_PREFIX: String = "spark.gluten.sql.columnar.backend.ch.runtime_config" + val CH_SETTING_PREFIX: String = "spark.gluten.sql.columnar.backend.ch.runtime_settings" + val S3_METADATA_PATH = s"/tmp/metadata/s3/$sparkVersion/" val S3_CACHE_PATH = s"/tmp/s3_cache/$sparkVersion/" val S3_ENDPOINT = "s3://127.0.0.1:9000/" @@ -178,11 +181,13 @@ class GlutenClickHouseWholeStageTransformerSuite extends WholeStageTransformerSu super.beforeAll() } - protected val rootPath: String = this.getClass.getResource("/").getPath - protected val basePath: String = rootPath + "tests-working-home" - protected val warehouse: String = basePath + "/spark-warehouse" - protected val metaStorePathAbsolute: String = basePath + "/meta" - protected val hiveMetaStoreDB: String = metaStorePathAbsolute + "/metastore_db" + final protected val rootPath: String = this.getClass.getResource("/").getPath + final protected val basePath: String = rootPath + "tests-working-home" + final protected val warehouse: String = basePath + "/spark-warehouse" + final protected val metaStorePathAbsolute: String = basePath + "/meta" + + protected val hiveMetaStoreDB: String = + s"$metaStorePathAbsolute/${getClass.getSimpleName}/metastore_db" final override protected val resourcePath: String = "" // ch not need this override protected val fileFormat: String = "parquet" diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseCountDistinctSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseCountDistinctSuite.scala index 28ff5874fabd1..383681733026b 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseCountDistinctSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseCountDistinctSuite.scala @@ -16,7 +16,8 @@ */ package org.apache.gluten.execution -import org.apache.gluten.execution.AllDataTypesWithComplexType.genTestData +import org.apache.gluten.test.AllDataTypesWithComplexType +import org.apache.gluten.test.AllDataTypesWithComplexType.genTestData import org.apache.spark.SparkConf class GlutenClickhouseCountDistinctSuite extends GlutenClickHouseWholeStageTransformerSuite { diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala index d3e3e94460369..45485ac90e1af 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenFunctionValidateSuite.scala @@ -730,4 +730,29 @@ class GlutenFunctionValidateSuite extends GlutenClickHouseWholeStageTransformerS runQueryAndCompare(aggregate_sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) } } + + test("test issue: https://github.com/apache/incubator-gluten/issues/6561") { + val sql = """ + |select + | map_from_arrays( + | transform(map_keys(map('t1',id,'t2',id+1)), v->v), + | array('a','b')) as b from range(10) + |""".stripMargin + runQueryAndCompare(sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) + } + + test("test function format_string") { + val sql = """ + | SELECT + | format_string( + | 'hello world %d %d %s %f', + | id, + | id, + | CAST(id AS STRING), + | CAST(id AS float) + | ) + |FROM range(10) + |""".stripMargin + 
runQueryAndCompare(sql)(checkGlutenOperatorMatch[ProjectExecTransformer]) + } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseFunctionSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseFunctionSuite.scala similarity index 52% rename from backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseFunctionSuite.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseFunctionSuite.scala index 8853dfc77853d..11d5290c0d0e4 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseFunctionSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseFunctionSuite.scala @@ -14,18 +14,13 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.execution +package org.apache.gluten.execution.compatibility import org.apache.gluten.GlutenConfig +import org.apache.gluten.execution.GlutenClickHouseTPCHAbstractSuite import org.apache.gluten.utils.UTSystemParameters import org.apache.spark.SparkConf -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.delta.DeltaLog - -import org.apache.commons.io.FileUtils - -import java.io.File class GlutenClickhouseFunctionSuite extends GlutenClickHouseTPCHAbstractSuite { override protected val needCopyParquetToTablePath = true @@ -39,9 +34,6 @@ class GlutenClickhouseFunctionSuite extends GlutenClickHouseTPCHAbstractSuite { createNotNullTPCHTablesInParquet(tablesPath) } - private var _hiveSpark: SparkSession = _ - override protected def spark: SparkSession = _hiveSpark - override protected def sparkConf: SparkConf = { new SparkConf() .set("spark.plugins", "org.apache.gluten.GlutenPlugin") @@ -69,73 +61,36 @@ class GlutenClickhouseFunctionSuite extends GlutenClickHouseTPCHAbstractSuite { .setMaster("local[1]") } - override protected def initializeSession(): Unit = { - if (_hiveSpark == null) { - val hiveMetaStoreDB = metaStorePathAbsolute + "/metastore_db" - _hiveSpark = SparkSession - .builder() - .config(sparkConf) - .enableHiveSupport() - .config( - "javax.jdo.option.ConnectionURL", - s"jdbc:derby:;databaseName=$hiveMetaStoreDB;create=true") - .getOrCreate() - } - } - - override def beforeAll(): Unit = { - // prepare working paths - val basePathDir = new File(basePath) - if (basePathDir.exists()) { - FileUtils.forceDelete(basePathDir) - } - FileUtils.forceMkdir(basePathDir) - FileUtils.forceMkdir(new File(warehouse)) - FileUtils.forceMkdir(new File(metaStorePathAbsolute)) - FileUtils.copyDirectory(new File(rootPath + resourcePath), new File(tablesPath)) - super.beforeAll() - } - - override protected def afterAll(): Unit = { - DeltaLog.clearCache() - - try { - super.afterAll() - } finally { - try { - if (_hiveSpark != null) { - try { - _hiveSpark.sessionState.catalog.reset() - } finally { - _hiveSpark.stop() - _hiveSpark = null - } - } - } finally { - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - } - } - } - test("test uuid - write and read") { withSQLConf( ("spark.gluten.sql.native.writer.enabled", "true"), (GlutenConfig.GLUTEN_ENABLED.key, "true")) { + withTable("uuid_test") { + spark.sql("create table if not exists uuid_test (id string) using parquet") - spark.sql("drop table if exists uuid_test") - spark.sql("create table if not exists uuid_test (id string) stored as parquet") - - val df = 
spark.sql("select regexp_replace(uuid(), '-', '') as id from range(1)") - df.cache() - df.write.insertInto("uuid_test") + val df = spark.sql("select regexp_replace(uuid(), '-', '') as id from range(1)") + df.cache() + df.write.insertInto("uuid_test") - val df2 = spark.table("uuid_test") - val diffCount = df.exceptAll(df2).count() - assert(diffCount == 0) + val df2 = spark.table("uuid_test") + val diffCount = df.exceptAll(df2).count() + assert(diffCount == 0) + } } } + test("https://github.com/apache/incubator-gluten/issues/6938") { + val testSQL = + s""" + |select * from ( + | select 1 as x, r_name as y, 's' as z from region + | union all + | select 2 as x, n_name as y, null as z from nation + |) order by y,x,z + |""".stripMargin + runQueryAndCompare(testSQL)(_ => ()) + } + test("Support In list option contains non-foldable expression") { runQueryAndCompare( """ @@ -181,49 +136,98 @@ class GlutenClickhouseFunctionSuite extends GlutenClickHouseTPCHAbstractSuite { } test("GLUTEN-5981 null value from get_json_object") { - spark.sql("create table json_t1 (a string) using parquet") - spark.sql("insert into json_t1 values ('{\"a\":null}')") - runQueryAndCompare( - """ - |SELECT get_json_object(a, '$.a') is null from json_t1 - |""".stripMargin - )(df => checkFallbackOperators(df, 0)) - spark.sql("drop table json_t1") + withTable("json_t1") { + spark.sql("create table json_t1 (a string) using parquet") + spark.sql("insert into json_t1 values ('{\"a\":null}')") + runQueryAndCompare( + """ + |SELECT get_json_object(a, '$.a') is null from json_t1 + |""".stripMargin + )(df => checkFallbackOperators(df, 0)) + } } test("Fix arrayDistinct(Array(Nullable(Decimal))) core dump") { - val create_sql = - """ - |create table if not exists test( - | dec array - |) using parquet - |""".stripMargin - val fill_sql = - """ - |insert into test values(array(1, 2, null)), (array(null, 2,3, 5)) - |""".stripMargin - val query_sql = - """ - |select array_distinct(dec) from test; - |""".stripMargin - spark.sql(create_sql) - spark.sql(fill_sql) - compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) - spark.sql("drop table test") + withTable("json_t1") { + val create_sql = + """ + |create table if not exists test( + | dec array + |) using parquet + |""".stripMargin + val fill_sql = + """ + |insert into test values(array(1, 2, null)), (array(null, 2,3, 5)) + |""".stripMargin + val query_sql = + """ + |select array_distinct(dec) from test; + |""".stripMargin + spark.sql(create_sql) + spark.sql(fill_sql) + compareResultsAgainstVanillaSpark(query_sql, true, { _ => }) + } } test("intersect all") { - spark.sql("create table t1 (a int, b string) using parquet") - spark.sql("insert into t1 values (1, '1'),(2, '2'),(3, '3'),(4, '4'),(5, '5'),(6, '6')") - spark.sql("create table t2 (a int, b string) using parquet") - spark.sql("insert into t2 values (4, '4'),(5, '5'),(6, '6'),(7, '7'),(8, '8'),(9, '9')") - runQueryAndCompare( - """ - |SELECT a,b FROM t1 INTERSECT ALL SELECT a,b FROM t2 - |""".stripMargin - )(df => checkFallbackOperators(df, 0)) - spark.sql("drop table t1") - spark.sql("drop table t2") + withTable("t1", "t2") { + spark.sql("create table t1 (a int, b string) using parquet") + spark.sql("insert into t1 values (1, '1'),(2, '2'),(3, '3'),(4, '4'),(5, '5'),(6, '6')") + spark.sql("create table t2 (a int, b string) using parquet") + spark.sql("insert into t2 values (4, '4'),(5, '5'),(6, '6'),(7, '7'),(8, '8'),(9, '9')") + runQueryAndCompare( + """ + |SELECT a,b FROM t1 INTERSECT ALL SELECT a,b FROM t2 + 
|""".stripMargin + )(df => checkFallbackOperators(df, 0)) + } + } + + test("array decimal32 CH column to row") { + compareResultsAgainstVanillaSpark("SELECT array(1.0, 2.0)", true, { _ => }, false) + compareResultsAgainstVanillaSpark("SELECT map(1.0, '2', 3.0, '4')", true, { _ => }, false) + } + + test("array decimal32 spark row to CH column") { + withTable("test_array_decimal") { + sql(""" + |create table test_array_decimal(val array) + |using parquet + |""".stripMargin) + sql(""" + |insert into test_array_decimal + |values array(1.0, 2.0), array(3.0, 4.0), + |array(5.0, 6.0), array(7.0, 8.0), array(7.0, 7.0) + |""".stripMargin) + // disable native scan so will get a spark row to CH column + withSQLConf(GlutenConfig.COLUMNAR_FILESCAN_ENABLED.key -> "false") { + val q = "SELECT max(val) from test_array_decimal" + compareResultsAgainstVanillaSpark(q, true, { _ => }, false) + val q2 = "SELECT max(val[0]) from test_array_decimal" + compareResultsAgainstVanillaSpark(q2, true, { _ => }, false) + val q3 = "SELECT max(val[1]) from test_array_decimal" + compareResultsAgainstVanillaSpark(q3, true, { _ => }, false) + } + } } + test("duplicate column name issue") { + withTable("left_table", "right_table") { + sql("create table left_table(id int, name string) using orc") + sql("create table right_table(id int, book string) using orc") + sql("insert into left_table values (1,'a'),(2,'b'),(3,'c'),(4,'d')") + sql("insert into right_table values (1,'a'),(1,'b'),(2,'c'),(2,'d')") + compareResultsAgainstVanillaSpark( + """ + |select p1.id, p1.name, p2.book + | from left_table p1 left join + | (select id, id, book + | from right_table where id <= 2) p2 + | on p1.id=p2.id + |""".stripMargin, + true, + { _ => } + ) + } + } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseStringFunctionsSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala similarity index 97% rename from backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseStringFunctionsSuite.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala index 163a8fedab7ea..98c0c2b35f202 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickhouseStringFunctionsSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/compatibility/GlutenClickhouseStringFunctionsSuite.scala @@ -14,7 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.execution +package org.apache.gluten.execution.compatibility + +import org.apache.gluten.execution.GlutenClickHouseWholeStageTransformerSuite import org.apache.spark.SparkConf diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseHiveTableSuite.scala similarity index 94% rename from backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseHiveTableSuite.scala index 8599b3002e3af..cc91556133433 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseHiveTableSuite.scala @@ -14,13 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.execution +package org.apache.gluten.execution.hive import org.apache.gluten.GlutenConfig +import org.apache.gluten.execution.{GlutenClickHouseWholeStageTransformerSuite, ProjectExecTransformer, TransformSupport} +import org.apache.gluten.test.AllDataTypesWithComplexType import org.apache.gluten.utils.UTSystemParameters import org.apache.spark.SparkConf -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.{DataFrame, SaveMode} import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.hive.HiveTableScanExecTransformer @@ -29,64 +31,14 @@ import org.apache.spark.sql.internal.SQLConf import org.apache.hadoop.fs.Path import java.io.{File, PrintWriter} -import java.sql.{Date, Timestamp} import scala.reflect.ClassTag -case class AllDataTypesWithComplexType( - string_field: String = null, - int_field: java.lang.Integer = null, - long_field: java.lang.Long = null, - float_field: java.lang.Float = null, - double_field: java.lang.Double = null, - short_field: java.lang.Short = null, - byte_field: java.lang.Byte = null, - boolean_field: java.lang.Boolean = null, - decimal_field: java.math.BigDecimal = null, - date_field: java.sql.Date = null, - timestamp_field: java.sql.Timestamp = null, - array: Seq[Int] = null, - arrayContainsNull: Seq[Option[Int]] = null, - map: Map[Int, Long] = null, - mapValueContainsNull: Map[Int, Option[Long]] = null -) - -object AllDataTypesWithComplexType { - def genTestData(): Seq[AllDataTypesWithComplexType] = { - (0 to 199).map { - i => - if (i % 100 == 1) { - AllDataTypesWithComplexType() - } else { - AllDataTypesWithComplexType( - s"$i", - i, - i.toLong, - i.toFloat, - i.toDouble, - i.toShort, - i.toByte, - i % 2 == 0, - new java.math.BigDecimal(i + ".56"), - Date.valueOf(new Date(System.currentTimeMillis()).toLocalDate.plusDays(i % 10)), - Timestamp.valueOf( - new Timestamp(System.currentTimeMillis()).toLocalDateTime.plusDays(i % 10)), - Seq.apply(i + 1, i + 2, i + 3), - Seq.apply(Option.apply(i + 1), Option.empty, Option.apply(i + 3)), - Map.apply((i + 1, i + 2), (i + 3, i + 4)), - Map.empty - ) - } - } - } -} - class GlutenClickHouseHiveTableSuite extends GlutenClickHouseWholeStageTransformerSuite + with ReCreateHiveSession with AdaptiveSparkPlanHelper { - private var _hiveSpark: SparkSession = _ - override protected def sparkConf: SparkConf = { new SparkConf() .set("spark.plugins", 
"org.apache.gluten.GlutenPlugin") @@ -112,25 +64,13 @@ class GlutenClickHouseHiveTableSuite .set("spark.hive.exec.dynamic.partition.mode", "nonstrict") .set("spark.gluten.supported.hive.udfs", "my_add") .set("spark.gluten.sql.columnar.backend.ch.runtime_config.use_local_format", "true") + .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .set( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseSparkCatalog") .setMaster("local[*]") } - override protected def spark: SparkSession = _hiveSpark - - override protected def initializeSession(): Unit = { - if (_hiveSpark == null) { - val hiveMetaStoreDB = metaStorePathAbsolute + "/metastore_db" - _hiveSpark = SparkSession - .builder() - .config(sparkConf) - .enableHiveSupport() - .config( - "javax.jdo.option.ConnectionURL", - s"jdbc:derby:;databaseName=$hiveMetaStoreDB;create=true") - .getOrCreate() - } - } - private val txt_table_name = "hive_txt_test" private val txt_user_define_input = "hive_txt_user_define_input" private val json_table_name = "hive_json_test" @@ -231,24 +171,7 @@ class GlutenClickHouseHiveTableSuite override protected def afterAll(): Unit = { DeltaLog.clearCache() - - try { - super.afterAll() - } finally { - try { - if (_hiveSpark != null) { - try { - _hiveSpark.sessionState.catalog.reset() - } finally { - _hiveSpark.stop() - _hiveSpark = null - } - } - } finally { - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - } - } + super.afterAll() } test("test hive text table") { @@ -953,7 +876,7 @@ class GlutenClickHouseHiveTableSuite val select_sql_4 = "select id, get_json_object(data, '$.v111') from test_tbl_3337" val select_sql_5 = "select id, get_json_object(data, 'v112') from test_tbl_3337" val select_sql_6 = - "select id, get_json_object(data, '$.id') from test_tbl_3337 where id = 123"; + "select id, get_json_object(data, '$.id') from test_tbl_3337 where id = 123" compareResultsAgainstVanillaSpark(select_sql_1, compareResult = true, _ => {}) compareResultsAgainstVanillaSpark(select_sql_2, compareResult = true, _ => {}) compareResultsAgainstVanillaSpark(select_sql_3, compareResult = true, _ => {}) @@ -1281,4 +1204,59 @@ class GlutenClickHouseHiveTableSuite compareResultsAgainstVanillaSpark(select_sql, true, { _ => }) sql(s"drop table if exists $tbl") } + + test("test mergetree write with column case sensitive on hive") { + val dataPath = s"$basePath/lineitem_mergetree_bucket" + val sourceDF = spark.sql(s""" + |select + | string_field, + | int_field, + | long_field + | from $txt_user_define_input + |""".stripMargin) + + sourceDF.write + .format("clickhouse") + .option("clickhouse.numBuckets", "1") + .option("clickhouse.bucketColumnNames", "STRING_FIELD") + .mode(SaveMode.Overwrite) + .save(dataPath) + } + + test("GLUTEN-6506: Orc read time zone") { + val dataPath = s"$basePath/orc-data/test_reader_time_zone.snappy.orc" + val create_table_sql = ("create table test_tbl_6506(" + + "id bigint, t timestamp) stored as orc location '%s'") + .format(dataPath) + val select_sql = "select * from test_tbl_6506" + spark.sql(create_table_sql) + compareResultsAgainstVanillaSpark(select_sql, compareResult = true, _ => {}) + spark.sql("drop table test_tbl_6506") + } + + test("GLUTEN-6879: Fix partition value diff when it contains blanks") { + val tableName = "test_tbl_6879" + sql(s"drop table if exists $tableName") + + val createSql = + s""" + |CREATE TABLE $tableName ( + | id INT, + | name STRING + |) PARTITIONED BY (part STRING) + |STORED AS 
PARQUET; + |""".stripMargin + sql(createSql) + + val insertSql = + s""" + |INSERT INTO $tableName PARTITION (part='part with spaces') + |VALUES (1, 'John Doe'); + |""".stripMargin + sql(insertSql) + + val selectSql = s"SELECT * FROM $tableName" + compareResultsAgainstVanillaSpark(selectSql, true, _ => {}) + sql(s"drop table if exists $tableName") + } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseNativeWriteTableSuite.scala similarity index 91% rename from backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseNativeWriteTableSuite.scala index 1f99947e5b96f..9e3fa00787de9 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseNativeWriteTableSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseNativeWriteTableSuite.scala @@ -14,34 +14,32 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.execution +package org.apache.gluten.execution.hive import org.apache.gluten.GlutenConfig -import org.apache.gluten.execution.AllDataTypesWithComplexType.genTestData +import org.apache.gluten.execution.GlutenClickHouseWholeStageTransformerSuite +import org.apache.gluten.test.AllDataTypesWithComplexType.genTestData import org.apache.gluten.utils.UTSystemParameters import org.apache.spark.SparkConf import org.apache.spark.gluten.NativeWriteChecker -import org.apache.spark.sql.SparkSession import org.apache.spark.sql.delta.DeltaLog import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper -import org.apache.spark.sql.test.SharedSparkSession -import org.apache.spark.sql.types.{DecimalType, LongType, StringType, StructField, StructType} - -import org.scalatest.BeforeAndAfterAll +import org.apache.spark.sql.types._ import scala.reflect.runtime.universe.TypeTag class GlutenClickHouseNativeWriteTableSuite extends GlutenClickHouseWholeStageTransformerSuite with AdaptiveSparkPlanHelper - with SharedSparkSession - with BeforeAndAfterAll + with ReCreateHiveSession with NativeWriteChecker { - private var _hiveSpark: SparkSession = _ - override protected def sparkConf: SparkConf = { + var sessionTimeZone = "GMT" + if (isSparkVersionGE("3.5")) { + sessionTimeZone = java.util.TimeZone.getDefault.getID + } new SparkConf() .set("spark.plugins", "org.apache.gluten.GlutenPlugin") .set("spark.memory.offHeap.enabled", "true") @@ -65,6 +63,7 @@ class GlutenClickHouseNativeWriteTableSuite // TODO: support default ANSI policy .set("spark.sql.storeAssignmentPolicy", "legacy") .set("spark.sql.warehouse.dir", getWarehouseDir) + .set("spark.sql.session.timeZone", sessionTimeZone) .set("spark.gluten.sql.columnar.backend.ch.runtime_config.logger.level", "error") .setMaster("local[1]") } @@ -75,45 +74,12 @@ class GlutenClickHouseNativeWriteTableSuite basePath + "/中文/spark-warehouse" } - override protected def spark: SparkSession = _hiveSpark - - override protected def initializeSession(): Unit = { - if (_hiveSpark == null) { - val hiveMetaStoreDB = metaStorePathAbsolute + "/metastore_db" - _hiveSpark = SparkSession - .builder() - .config(sparkConf) - .enableHiveSupport() - .config( - "javax.jdo.option.ConnectionURL", - 
s"jdbc:derby:;databaseName=$hiveMetaStoreDB;create=true") - .getOrCreate() - } - } - private val table_name_template = "hive_%s_test" private val table_name_vanilla_template = "hive_%s_test_written_by_vanilla" override protected def afterAll(): Unit = { DeltaLog.clearCache() - - try { - super.afterAll() - } finally { - try { - if (_hiveSpark != null) { - try { - _hiveSpark.sessionState.catalog.reset() - } finally { - _hiveSpark.stop() - _hiveSpark = null - } - } - } finally { - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - } - } + super.afterAll() } def getColumnName(s: String): String = { @@ -187,7 +153,7 @@ class GlutenClickHouseNativeWriteTableSuite checkNative: Boolean = true): Unit = nativeWrite { format => val (table_name, table_create_sql, insert_sql) = f(format) - withDestinationTable(table_name, table_create_sql) { + withDestinationTable(table_name, Option(table_create_sql)) { checkInsertQuery(insert_sql, checkNative) Option(extraCheck).foreach(_(table_name, format)) } @@ -218,15 +184,36 @@ class GlutenClickHouseNativeWriteTableSuite } test("supplier: csv to parquet- insert overwrite local directory") { + val partitionNumber = 7 withSource(supplierDF, "supplier") { - nativeWrite { - format => + nativeWrite2( + format => { val sql = s"""insert overwrite local directory |'$basePath/test_insert_into_${format}_supplier' - |stored as $format select * from supplier""".stripMargin - checkInsertQuery(sql, checkNative = true) - } + |stored as $format + |select /*+ REPARTITION($partitionNumber) */ * from supplier""".stripMargin + (s"test_insert_into_${format}_supplier", null, sql) + }, + (table_name, format) => { + // spark 3.2 without orc or parquet suffix + val files = recursiveListFiles(new File(s"$basePath/$table_name")) + .map(_.getName) + .filterNot(s => s.endsWith(s".crc") || s.equals("_SUCCESS")) + + lazy val fileNames = { + val dir = s"$basePath/$table_name" + recursiveListFiles(new File(dir)) + .map(f => f.getAbsolutePath.stripPrefix(dir)) + .sorted + .mkString("\n") + } + + lazy val errorMessage = + s"Search $basePath/$table_name with suffix .$format, all files: \n $fileNames" + assert(files.length === partitionNumber, errorMessage) + } + ) } } @@ -602,20 +589,12 @@ class GlutenClickHouseNativeWriteTableSuite ("date_field", "date"), ("timestamp_field", "timestamp") ) - def excludeTimeFieldForORC(format: String): Seq[String] = { - if (format.equals("orc") && isSparkVersionGE("3.5")) { - // FIXME:https://github.com/apache/incubator-gluten/pull/6507 - fields.keys.filterNot(_.equals("timestamp_field")).toSeq - } else { - fields.keys.toSeq - } - } val origin_table = "origin_table" withSource(genTestData(), origin_table) { nativeWrite { format => val table_name = table_name_template.format(format) - val testFields = excludeTimeFieldForORC(format) + val testFields = fields.keys.toSeq writeAndCheckRead(origin_table, table_name, testFields, isSparkVersionLE("3.3")) { fields => spark @@ -851,7 +830,7 @@ class GlutenClickHouseNativeWriteTableSuite val table_name = "t_" + format withDestinationTable( table_name, - s"create table $table_name (id int, str string) stored as $format") { + Some(s"create table $table_name (id int, str string) stored as $format")) { checkInsertQuery( s"insert overwrite table $table_name " + "select id, cast(id as string) from range(10) union all " + @@ -919,4 +898,37 @@ class GlutenClickHouseNativeWriteTableSuite _ => {}) ) } + + test("GLUTEN-2584: fix native write and read mismatch about complex types") { + def table(format: String): 
String = s"t_$format" + def create(format: String, table_name: Option[String] = None): String = + s"""CREATE TABLE ${table_name.getOrElse(table(format))}( + | id INT, + | info STRUCT, + | data MAP, + | values ARRAY + |) stored as $format""".stripMargin + def insert(format: String, table_name: Option[String] = None): String = + s"""INSERT overwrite ${table_name.getOrElse(table(format))} VALUES + | (6, null, null, null); + """.stripMargin + + nativeWrite2( + format => (table(format), create(format), insert(format)), + (table_name, format) => { + val vanilla_table = s"${table_name}_v" + val vanilla_create = create(format, Some(vanilla_table)) + vanillaWrite { + withDestinationTable(vanilla_table, Option(vanilla_create)) { + checkInsertQuery(insert(format, Some(vanilla_table)), checkNative = false) + } + } + val rowsFromOriginTable = + spark.sql(s"select * from $vanilla_table").collect() + val dfFromWriteTable = + spark.sql(s"select * from $table_name") + checkAnswer(dfFromWriteTable, rowsFromOriginTable) + } + ) + } } diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTableAfterRestart.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseTableAfterRestart.scala similarity index 87% rename from backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTableAfterRestart.scala rename to backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseTableAfterRestart.scala index f9e831cb4aa7b..d359428d03cab 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseTableAfterRestart.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/GlutenClickHouseTableAfterRestart.scala @@ -14,12 +14,14 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.execution +package org.apache.gluten.execution.hive + +import org.apache.gluten.execution.GlutenClickHouseTPCHAbstractSuite import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.spark.sql.SparkSession.{getActiveSession, getDefaultSession} -import org.apache.spark.sql.delta.{ClickhouseSnapshot, DeltaLog} +import org.apache.spark.sql.delta.ClickhouseSnapshot import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper @@ -33,7 +35,8 @@ import java.io.File // This suite is to make sure clickhouse commands works well even after spark restart class GlutenClickHouseTableAfterRestart extends GlutenClickHouseTPCHAbstractSuite - with AdaptiveSparkPlanHelper { + with AdaptiveSparkPlanHelper + with ReCreateHiveSession { override protected val needCopyParquetToTablePath = true @@ -64,56 +67,18 @@ class GlutenClickHouseTableAfterRestart .set( "spark.gluten.sql.columnar.backend.ch.runtime_settings.input_format_parquet_max_block_size", "8192") + .setMaster("local[2]") } override protected def createTPCHNotNullTables(): Unit = { createNotNullTPCHTablesInParquet(tablesPath) } - private var _hiveSpark: SparkSession = _ - override protected def spark: SparkSession = _hiveSpark - - override protected def initializeSession(): Unit = { - if (_hiveSpark == null) { - val hiveMetaStoreDB = metaStorePathAbsolute + "/metastore_db_" + current_db_num - current_db_num += 1 - - _hiveSpark = SparkSession - .builder() - .config(sparkConf) - .enableHiveSupport() - .config( - "javax.jdo.option.ConnectionURL", - s"jdbc:derby:;databaseName=$hiveMetaStoreDB;create=true") - .master("local[2]") - .getOrCreate() - } - } - - override protected def afterAll(): Unit = { - DeltaLog.clearCache() - - try { - super.afterAll() - } finally { - try { - if (_hiveSpark != null) { - try { - _hiveSpark.sessionState.catalog.reset() - } finally { - _hiveSpark.stop() - _hiveSpark = null - } - } - } finally { - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - } - } - } - var current_db_num: Int = 0 + override protected val hiveMetaStoreDB: String = + metaStorePathAbsolute + "/metastore_db_" + current_db_num + test("test mergetree after restart") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree; @@ -347,22 +312,22 @@ class GlutenClickHouseTableAfterRestart SparkSession.clearDefaultSession() } - val hiveMetaStoreDB = metaStorePathAbsolute + "/metastore_db_" + val metaStoreDB = metaStorePathAbsolute + "/metastore_db_" // use metastore_db2 to avoid issue: "Another instance of Derby may have already booted the database" - val destDir = new File(hiveMetaStoreDB + current_db_num) - destDir.mkdirs() - FileUtils.copyDirectory(new File(hiveMetaStoreDB + (current_db_num - 1)), destDir) - _hiveSpark = null - _hiveSpark = SparkSession - .builder() - .config(sparkConf) - .enableHiveSupport() - .config( - "javax.jdo.option.ConnectionURL", - s"jdbc:derby:;databaseName=$hiveMetaStoreDB$current_db_num") - .master("local[2]") - .getOrCreate() current_db_num += 1 + val destDir = new File(metaStoreDB + current_db_num) + destDir.mkdirs() + FileUtils.copyDirectory(new File(metaStoreDB + (current_db_num - 1)), destDir) + updateHiveSession( + SparkSession + .builder() + .config(sparkConf) + .enableHiveSupport() + .config( + "javax.jdo.option.ConnectionURL", + s"jdbc:derby:;databaseName=$metaStoreDB$current_db_num") + .getOrCreate() + ) } } // scalastyle:off line.size.limit diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/ReCreateHiveSession.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/ReCreateHiveSession.scala new file mode 100644 index 0000000000000..c251e46364f5d --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/hive/ReCreateHiveSession.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution.hive + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.test.SharedSparkSession + +import org.scalatest.BeforeAndAfterAll + +trait ReCreateHiveSession extends SharedSparkSession with BeforeAndAfterAll { + + protected val hiveMetaStoreDB: String + + private var _hiveSpark: SparkSession = _ + + override protected def spark: SparkSession = _hiveSpark + + override protected def initializeSession(): Unit = { + if (_hiveSpark == null) { + _hiveSpark = SparkSession + .builder() + .config(sparkConf) + .enableHiveSupport() + .config( + "javax.jdo.option.ConnectionURL", + s"jdbc:derby:;databaseName=$hiveMetaStoreDB;create=true") + .getOrCreate() + } + } + + override protected def afterAll(): Unit = { + try { + super.afterAll() + } finally { + try { + if (_hiveSpark != null) { + try { + _hiveSpark.sessionState.catalog.reset() + } finally { + _hiveSpark.stop() + _hiveSpark = null + } + } + } finally { + SparkSession.clearActiveSession() + SparkSession.clearDefaultSession() + } + } + } + + protected def updateHiveSession(newSession: SparkSession): Unit = { + _hiveSpark = null + _hiveSpark = newSession + } +} diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetFilterSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetFilterSuite.scala index b4e4cea9173b6..a2f897d378114 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetFilterSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/parquet/GlutenParquetFilterSuite.scala @@ -17,13 +17,12 @@ package org.apache.gluten.execution.parquet import org.apache.gluten.execution.{FileSourceScanExecTransformer, GlutenClickHouseWholeStageTransformerSuite} -import org.apache.gluten.test.GlutenSQLTestUtils +import org.apache.gluten.test.{GlutenSQLTestUtils, GlutenTPCHBase} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ -import org.apache.spark.sql.gluten.test.GlutenTPCHBase import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.Decimal @@ -35,12 +34,6 @@ class GlutenParquetFilterSuite with GlutenTPCHBase with Logging { - override 
protected val rootPath = this.getClass.getResource("/").getPath - override protected val basePath = rootPath + "tests-working-home" - override protected val warehouse = basePath + "/spark-warehouse" - override protected val metaStorePathAbsolute = basePath + "/meta" - override protected val hiveMetaStoreDB = metaStorePathAbsolute + "/metastore_db" - private val tpchQueriesResourceFolder: String = rootPath + "../../../../gluten-core/src/test/resources/tpch-queries" @@ -391,13 +384,13 @@ class GlutenParquetFilterSuite 'p_size.int >= 1, 'p_partkey.long.isNotNull, ('p_brand.string === "Brand#12" && - ('p_container.string in ("SM CASE", "SM BOX", "SM PACK", "SM PKG")) && + 'p_container.string.in("SM CASE", "SM BOX", "SM PACK", "SM PKG") && 'p_size.int <= 5) || ('p_brand.string === "Brand#23" && - ('p_container.string in ("MED BAG", "MED BOX", "MED PKG", "MED PACK")) && + 'p_container.string.in("MED BAG", "MED BOX", "MED PKG", "MED PACK") && 'p_size.int <= 10) || ('p_brand.string === "Brand#34" && - ('p_container.string in ("LG CASE", "LG BOX", "LG PACK", "LG PKG")) && + 'p_container.string.in("LG CASE", "LG BOX", "LG PACK", "LG PKG") && 'p_size.int <= 15) ) ), diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseHDFSSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseHDFSSuite.scala new file mode 100644 index 0000000000000..fbea2ed464d79 --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/tpch/GlutenClickHouseHDFSSuite.scala @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.execution.tpch + +import org.apache.gluten.execution.{CHNativeCacheManager, FileSourceScanExecTransformer, GlutenClickHouseTPCHAbstractSuite} + +import org.apache.spark.SparkConf +import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper + +import org.apache.hadoop.fs.Path + +class GlutenClickHouseHDFSSuite + extends GlutenClickHouseTPCHAbstractSuite + with AdaptiveSparkPlanHelper { + + override protected val tablesPath: String = HDFS_URL_ENDPOINT + "/tpch-data" + override protected val tpchQueries: String = + rootPath + "../../../../gluten-core/src/test/resources/tpch-queries" + override protected val queriesResults: String = rootPath + "queries-output" + + private val hdfsCachePath = "/tmp/gluten_hdfs_cache/" + private val cache_name = "gluten_cache" + + /** Run Gluten + ClickHouse Backend with SortShuffleManager */ + override protected def sparkConf: SparkConf = { + super.sparkConf + .set("spark.shuffle.manager", "sort") + .set("spark.io.compression.codec", "snappy") + .set("spark.sql.shuffle.partitions", "5") + .set("spark.sql.autoBroadcastJoinThreshold", "10MB") + .set("spark.sql.adaptive.enabled", "true") + .set(s"$CH_CONFIG_PREFIX.use_local_format", "true") + .set("spark.gluten.sql.columnar.backend.ch.shuffle.hash.algorithm", "sparkMurmurHash3_32") + .set(s"$CH_CONFIG_PREFIX.gluten_cache.local.enabled", "true") + .set(s"$CH_CONFIG_PREFIX.gluten_cache.local.name", cache_name) + .set(s"$CH_CONFIG_PREFIX.gluten_cache.local.path", hdfsCachePath) + .set(s"$CH_CONFIG_PREFIX.gluten_cache.local.max_size", "10Gi") + .set(s"$CH_CONFIG_PREFIX.reuse_disk_cache", "false") + .set("spark.sql.adaptive.enabled", "false") + } + + override protected def createTPCHNotNullTables(): Unit = { + createNotNullTPCHTablesInParquet(tablesPath) + } + + override def beforeAll(): Unit = { + super.beforeAll() + } + + override protected def beforeEach(): Unit = { + super.beforeEach() + deleteCache() + } + + private def deleteCache(): Unit = { + val targetFile = new Path(tablesPath) + val fs = targetFile.getFileSystem(spark.sessionState.newHadoopConf()) + fs.listStatus(targetFile) + .foreach( + table => { + if (table.isDirectory) { + fs.listStatus(table.getPath) + .foreach( + data => { + if (data.isFile) { + CHNativeCacheManager + .removeFiles(data.getPath.toUri.getPath.substring(1), cache_name) + } + }) + } + }) + clearDataPath(hdfsCachePath) + } + + val runWithoutCache: () => Unit = () => { + runTPCHQuery(6) { + df => + val plans = df.queryExecution.executedPlan.collect { + case scanExec: FileSourceScanExecTransformer => scanExec + } + assert(plans.size == 1) + assert(plans.head.metrics("readMissBytes").value != 0) + } + } + + val runWithCache: () => Unit = () => { + runTPCHQuery(6) { + df => + val plans = df.queryExecution.executedPlan.collect { + case scanExec: FileSourceScanExecTransformer => scanExec + } + assert(plans.size == 1) + assert(plans.head.metrics("readMissBytes").value == 0) + assert(plans.head.metrics("readCacheBytes").value != 0) + } + } + + ignore("test hdfs cache") { + runWithoutCache() + runWithCache() + } + + ignore("test cache file command") { + runSql( + s"CACHE FILES select * from '$HDFS_URL_ENDPOINT/tpch-data/lineitem'", + noFallBack = false) { _ => } + runWithCache() + } + + ignore("test no cache by query") { + withSQLConf( + s"$CH_SETTING_PREFIX.read_from_filesystem_cache_if_exists_otherwise_bypass_cache" -> "true") { + runWithoutCache() + } + + runWithoutCache() + runWithCache() + } +} diff --git 
a/backends-clickhouse/src/test/scala/org/apache/gluten/test/AllDataTypesWithComplexType.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/test/AllDataTypesWithComplexType.scala new file mode 100644 index 0000000000000..19abcbea433a1 --- /dev/null +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/test/AllDataTypesWithComplexType.scala @@ -0,0 +1,67 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.test + +import java.sql.{Date, Timestamp} + +case class AllDataTypesWithComplexType( + string_field: String = null, + int_field: java.lang.Integer = null, + long_field: java.lang.Long = null, + float_field: java.lang.Float = null, + double_field: java.lang.Double = null, + short_field: java.lang.Short = null, + byte_field: java.lang.Byte = null, + boolean_field: java.lang.Boolean = null, + decimal_field: java.math.BigDecimal = null, + date_field: java.sql.Date = null, + timestamp_field: java.sql.Timestamp = null, + array: Seq[Int] = null, + arrayContainsNull: Seq[Option[Int]] = null, + map: Map[Int, Long] = null, + mapValueContainsNull: Map[Int, Option[Long]] = null +) + +object AllDataTypesWithComplexType { + def genTestData(): Seq[AllDataTypesWithComplexType] = { + (0 to 199).map { + i => + if (i % 100 == 1) { + AllDataTypesWithComplexType() + } else { + AllDataTypesWithComplexType( + s"$i", + i, + i.toLong, + i.toFloat, + i.toDouble, + i.toShort, + i.toByte, + i % 2 == 0, + new java.math.BigDecimal(i + ".56"), + Date.valueOf(new Date(System.currentTimeMillis()).toLocalDate.plusDays(i % 10)), + Timestamp.valueOf( + new Timestamp(System.currentTimeMillis()).toLocalDateTime.plusDays(i % 10)), + Seq.apply(i + 1, i + 2, i + 3), + Seq.apply(Option.apply(i + 1), Option.empty, Option.apply(i + 3)), + Map.apply((i + 1, i + 2), (i + 3, i + 4)), + Map.empty + ) + } + } + } +} diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCHBase.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCHBase.scala index 685f185ac81ff..5a07ebab15b51 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCHBase.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/test/GlutenTPCHBase.scala @@ -14,9 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.spark.sql.gluten.test - -import org.apache.gluten.test.GlutenTPCBase +package org.apache.gluten.test import org.apache.spark.sql.catalyst.TableIdentifier @@ -51,7 +49,11 @@ trait GlutenTPCHBase extends GlutenTPCBase { override def dropTables(): Unit = { tpchCreateTable.keys.foreach { - tableName => spark.sessionState.catalog.dropTable(TableIdentifier(tableName), true, true) + tableName => + spark.sessionState.catalog.dropTable( + TableIdentifier(tableName), + ignoreIfNotExists = true, + purge = true) } } diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/gluten/NativeWriteChecker.scala b/backends-clickhouse/src/test/scala/org/apache/spark/gluten/NativeWriteChecker.scala index fc30d151b675d..49e368c888e78 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/gluten/NativeWriteChecker.scala +++ b/backends-clickhouse/src/test/scala/org/apache/spark/gluten/NativeWriteChecker.scala @@ -62,9 +62,10 @@ trait NativeWriteChecker spark.sql(sqlStr) } - def withDestinationTable(table: String, createTableSql: String = "select 1")(f: => Unit): Unit = { + def withDestinationTable(table: String, createTableSql: Option[String] = None)( + f: => Unit): Unit = { spark.sql(s"drop table IF EXISTS $table") - spark.sql(s"$createTableSql") + createTableSql.foreach(spark.sql) f } @@ -74,6 +75,12 @@ trait NativeWriteChecker } } + def vanillaWrite(block: => Unit): Unit = { + withSQLConf(("spark.gluten.sql.native.writer.enabled", "false")) { + block + } + } + def withSource(df: Dataset[Row], viewName: String, pairs: (String, String)*)( block: => Unit): Unit = { withSQLConf(pairs: _*) { diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala b/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala index 141bf5eea5cb9..87c389a651c35 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala +++ b/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHHashBuildBenchmark.scala @@ -104,7 +104,7 @@ object CHHashBuildBenchmark extends SqlBasedBenchmark with CHSqlBasedBenchmark w ( countsAndBytes.flatMap(_._2), countsAndBytes.map(_._1).sum, - BroadCastHashJoinContext(Seq(child.output.head), Inner, false, false, child.output, "") + BroadCastHashJoinContext(Seq(child.output.head), Inner, true, false, false, child.output, "") ) } } diff --git a/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHStorageJoinBenchmark.scala b/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHStorageJoinBenchmark.scala index 194eccc50878a..322c9521e74e2 100644 --- a/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHStorageJoinBenchmark.scala +++ b/backends-clickhouse/src/test/scala/org/apache/spark/sql/execution/benchmarks/CHStorageJoinBenchmark.scala @@ -97,7 +97,7 @@ object CHStorageJoinBenchmark extends SqlBasedBenchmark with CHSqlBasedBenchmark _numRows += batch.numRows } Iterator((_numRows, blockNativeWriter.collectAsByteArray())) - // Iterator((_numRows, new Array[Byte](0))) + // Iterator((_numRows, new Array[Byte](0))) } .collect val count0 = countsAndBytes.map(_._1).sum @@ -191,7 +191,8 @@ object CHStorageJoinBenchmark extends SqlBasedBenchmark with CHSqlBasedBenchmark batch => val bos = new ByteArrayOutputStream() val buffer = new Array[Byte](4 << 10) // 4K - val dout = new BlockOutputStream(bos, buffer, dataSize, true, 
"lz4", buffer.length) + val dout = + new BlockOutputStream(bos, buffer, dataSize, true, "lz4", Int.MinValue, buffer.length) dout.write(batch) dout.flush() dout.close() diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchResizer.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchResizer.java index e2f11cd3510b8..2a6dcb43a0528 100644 --- a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchResizer.java +++ b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchResizer.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.utils; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.Runtimes; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.Runtimes; import org.apache.gluten.vectorized.ColumnarBatchInIterator; import org.apache.gluten.vectorized.ColumnarBatchOutIterator; diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchResizerJniWrapper.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchResizerJniWrapper.java index 3011ced2ab379..8855dd2acbbda 100644 --- a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchResizerJniWrapper.java +++ b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBatchResizerJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.utils; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; import org.apache.gluten.vectorized.ColumnarBatchInIterator; public class VeloxBatchResizerJniWrapper implements RuntimeAware { diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilter.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilter.java index f23426d7da9dd..10179d63edb33 100644 --- a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilter.java +++ b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilter.java @@ -16,7 +16,7 @@ */ package org.apache.gluten.utils; -import org.apache.gluten.exec.Runtimes; +import org.apache.gluten.runtime.Runtimes; import org.apache.commons.io.IOUtils; import org.apache.spark.util.sketch.BloomFilter; diff --git a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilterJniWrapper.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilterJniWrapper.java index 3ddfd2c02ed8c..d5ee784150867 100644 --- a/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilterJniWrapper.java +++ b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxBloomFilterJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.utils; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; public class VeloxBloomFilterJniWrapper implements RuntimeAware { private final Runtime runtime; diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllPlanNode.java b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxFileSystemValidationJniWrapper.java similarity index 73% rename from gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllPlanNode.java rename to backends-velox/src/main/java/org/apache/gluten/utils/VeloxFileSystemValidationJniWrapper.java index 80c57eaa8cb17..8e9bd3f03d29e 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllPlanNode.java +++ 
b/backends-velox/src/main/java/org/apache/gluten/utils/VeloxFileSystemValidationJniWrapper.java @@ -14,16 +14,9 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.substrait.ddlplan; +package org.apache.gluten.utils; -import io.substrait.proto.DllPlan; +public class VeloxFileSystemValidationJniWrapper { -/** Contains helper functions for constructing substrait relations. */ -public interface DllPlanNode { - /** - * Converts a Expression into a protobuf. - * - * @return A rel protobuf - */ - DllPlan toProtobuf(); + public static native boolean allSupportedByRegisteredFileSystems(String[] paths); } diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala index 0eb6126876b55..065adf338c191 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala @@ -25,14 +25,13 @@ import org.apache.gluten.extension.ValidationResult import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat.{DwrfReadFormat, OrcReadFormat, ParquetReadFormat} +import org.apache.gluten.utils._ import org.apache.spark.sql.catalyst.catalog.BucketSpec -import org.apache.spark.sql.catalyst.expressions.{Alias, CumeDist, DenseRank, Descending, EulerNumber, Expression, Lag, Lead, Literal, MakeYMInterval, NamedExpression, NthValue, NTile, PercentRank, Pi, Rand, RangeFrame, Rank, RowNumber, SortOrder, SparkPartitionID, SparkVersion, SpecialFrameBoundary, SpecifiedWindowFrame, Uuid} -import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, ApproximatePercentile, Count, Sum} +import org.apache.spark.sql.catalyst.expressions.{Alias, CumeDist, DenseRank, Descending, Expression, Lag, Lead, NamedExpression, NthValue, NTile, PercentRank, RangeFrame, Rank, RowNumber, SortOrder, SpecialFrameBoundary, SpecifiedWindowFrame} +import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, ApproximatePercentile} import org.apache.spark.sql.catalyst.plans.{JoinType, LeftOuter, RightOuter} import org.apache.spark.sql.catalyst.util.CharVarcharUtils -import org.apache.spark.sql.execution.{ProjectExec, SparkPlan} -import org.apache.spark.sql.execution.aggregate.HashAggregateExec import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand import org.apache.spark.sql.execution.datasources.{FileFormat, InsertIntoHadoopFsRelationCommand} import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat @@ -40,6 +39,8 @@ import org.apache.spark.sql.hive.execution.HiveFileFormat import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ +import org.apache.hadoop.fs.Path + import scala.util.control.Breaks.breakable class VeloxBackend extends Backend { @@ -52,6 +53,7 @@ class VeloxBackend extends Backend { override def validatorApi(): ValidatorApi = new VeloxValidatorApi override def metricsApi(): MetricsApi = new VeloxMetricsApi override def listenerApi(): ListenerApi = new VeloxListenerApi + override def ruleApi(): RuleApi = new VeloxRuleApi override def settings(): BackendSettingsApi = VeloxBackendSettings } @@ -66,14 +68,24 @@ object VeloxBackendSettings extends BackendSettingsApi { val 
GLUTEN_VELOX_UDF_LIB_PATHS = getBackendConfigPrefix() + ".udfLibraryPaths" val GLUTEN_VELOX_DRIVER_UDF_LIB_PATHS = getBackendConfigPrefix() + ".driver.udfLibraryPaths" val GLUTEN_VELOX_INTERNAL_UDF_LIB_PATHS = getBackendConfigPrefix() + ".internal.udfLibraryPaths" + val GLUTEN_VELOX_UDF_ALLOW_TYPE_CONVERSION = getBackendConfigPrefix() + ".udfAllowTypeConversion" val MAXIMUM_BATCH_SIZE: Int = 32768 - override def supportFileFormatRead( + override def validateScan( format: ReadFileFormat, fields: Array[StructField], partTable: Boolean, + rootPaths: Seq[String], paths: Seq[String]): ValidationResult = { + val filteredRootPaths = distinctRootPaths(rootPaths) + if ( + !filteredRootPaths.isEmpty && !VeloxFileSystemValidationJniWrapper + .allSupportedByRegisteredFileSystems(filteredRootPaths.toArray) + ) { + return ValidationResult.failed( + s"Scheme of [$filteredRootPaths] is not supported by registered file systems.") + } // Validate if all types are supported. def validateTypes(validatorFunc: PartialFunction[StructField, String]): ValidationResult = { // Collect unsupported types. @@ -178,6 +190,17 @@ object VeloxBackendSettings extends BackendSettingsApi { .isDefined } + def distinctRootPaths(paths: Seq[String]): Seq[String] = { + // Skip native validation for local path, as local file system is always registered. + // For evey file scheme, only one path is kept. + paths + .map(p => (new Path(p).toUri.getScheme, p)) + .groupBy(_._1) + .filter(_._1 != "file") + .map(_._2.head._2) + .toSeq + } + override def supportWriteFilesExec( format: FileFormat, fields: Array[StructField], @@ -418,49 +441,6 @@ object VeloxBackendSettings extends BackendSettingsApi { } } - /** - * Check whether a plan needs to be offloaded even though they have empty input schema, e.g, - * Sum(1), Count(1), rand(), etc. - * @param plan: - * The Spark plan to check. - */ - private def mayNeedOffload(plan: SparkPlan): Boolean = { - def checkExpr(expr: Expression): Boolean = { - expr match { - // Block directly falling back the below functions by FallbackEmptySchemaRelation. - case alias: Alias => checkExpr(alias.child) - case _: Rand | _: Uuid | _: MakeYMInterval | _: SparkPartitionID | _: EulerNumber | _: Pi | - _: SparkVersion => - true - case _ => false - } - } - - plan match { - case exec: HashAggregateExec if exec.aggregateExpressions.nonEmpty => - // Check Sum(Literal) or Count(Literal). - exec.aggregateExpressions.forall( - expression => { - val aggFunction = expression.aggregateFunction - aggFunction match { - case Sum(Literal(_, _), _) => true - case Count(Seq(Literal(_, _))) => true - case _ => false - } - }) - case p: ProjectExec if p.projectList.nonEmpty => - p.projectList.forall(checkExpr(_)) - case _ => - false - } - } - - override def fallbackOnEmptySchema(plan: SparkPlan): Boolean = { - // Count(1) and Sum(1) are special cases that Velox backend can handle. - // Do not fallback it and its children in the first place. 
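// Editor's aside (illustrative only, not part of the patch): the distinctRootPaths helper
// added above keeps one representative path per URI scheme and skips the local "file"
// scheme, so native file-system validation probes each remote scheme at most once.
// A minimal usage sketch, assuming the method as defined in VeloxBackendSettings:
import org.apache.gluten.backendsapi.velox.VeloxBackendSettings

object DistinctRootPathsExample {
  def main(args: Array[String]): Unit = {
    val roots = Seq(
      "hdfs://nn:8020/warehouse/t1",
      "hdfs://nn:8020/warehouse/t2",
      "file:///tmp/local/t3")
    // Expected: one hdfs path kept, "file" paths dropped -> Seq("hdfs://nn:8020/warehouse/t1")
    println(VeloxBackendSettings.distinctRootPaths(roots))
  }
}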
- !mayNeedOffload(plan) - } - override def fallbackAggregateWithEmptyOutputChild(): Boolean = true override def recreateJoinExecOnFallback(): Boolean = true diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala index 86925fd1d6a85..2cfc4e9a9099d 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxListenerApi.scala @@ -18,7 +18,6 @@ package org.apache.gluten.backendsapi.velox import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.ListenerApi -import org.apache.gluten.exception.GlutenException import org.apache.gluten.execution.datasource.{GlutenOrcWriterInjects, GlutenParquetWriterInjects, GlutenRowSplitter} import org.apache.gluten.expression.UDFMappings import org.apache.gluten.init.NativeBackendInitializer @@ -27,138 +26,76 @@ import org.apache.gluten.vectorized.{JniLibLoader, JniWorkspace} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.api.plugin.PluginContext +import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.datasources.velox.{VeloxOrcWriterInjects, VeloxParquetWriterInjects, VeloxRowSplitter} import org.apache.spark.sql.expression.UDFResolver import org.apache.spark.sql.internal.{GlutenConfigUtil, StaticSQLConf} -import org.apache.spark.util.SparkDirectoryUtil +import org.apache.spark.util.{SparkDirectoryUtil, SparkResourceUtil} import org.apache.commons.lang3.StringUtils -import scala.sys.process._ +import java.util.concurrent.atomic.AtomicBoolean -class VeloxListenerApi extends ListenerApi { - private val ARROW_VERSION = "1500" +class VeloxListenerApi extends ListenerApi with Logging { + import VeloxListenerApi._ override def onDriverStart(sc: SparkContext, pc: PluginContext): Unit = { val conf = pc.conf() - // sql table cache serializer + + // Sql table cache serializer. if (conf.getBoolean(GlutenConfig.COLUMNAR_TABLE_CACHE_ENABLED.key, defaultValue = false)) { conf.set( StaticSQLConf.SPARK_CACHE_SERIALIZER.key, "org.apache.spark.sql.execution.ColumnarCachedBatchSerializer") } - initialize(conf, isDriver = true) + + // Static initializers for driver. + if (!driverInitialized.compareAndSet(false, true)) { + // Make sure we call the static initializers only once. + logInfo( + "Skip rerunning static initializers since they are only supposed to run once." 
+ + " You see this message probably because you are creating a new SparkSession.") + return + } + + SparkDirectoryUtil.init(conf) + UDFResolver.resolveUdfConf(conf, isDriver = true) + initialize(conf) } override def onDriverShutdown(): Unit = shutdown() override def onExecutorStart(pc: PluginContext): Unit = { - initialize(pc.conf(), isDriver = false) - } - - override def onExecutorShutdown(): Unit = shutdown() + val conf = pc.conf() - private def getLibraryLoaderForOS( - systemName: String, - systemVersion: String, - system: String): SharedLibraryLoader = { - if (systemName.contains("Ubuntu") && systemVersion.startsWith("20.04")) { - new SharedLibraryLoaderUbuntu2004 - } else if (systemName.contains("Ubuntu") && systemVersion.startsWith("22.04")) { - new SharedLibraryLoaderUbuntu2204 - } else if (systemName.contains("CentOS") && systemVersion.startsWith("9")) { - new SharedLibraryLoaderCentos9 - } else if (systemName.contains("CentOS") && systemVersion.startsWith("8")) { - new SharedLibraryLoaderCentos8 - } else if (systemName.contains("CentOS") && systemVersion.startsWith("7")) { - new SharedLibraryLoaderCentos7 - } else if (systemName.contains("Alibaba Cloud Linux") && systemVersion.startsWith("3")) { - new SharedLibraryLoaderCentos8 - } else if (systemName.contains("Alibaba Cloud Linux") && systemVersion.startsWith("2")) { - new SharedLibraryLoaderCentos7 - } else if (systemName.contains("Anolis") && systemVersion.startsWith("8")) { - new SharedLibraryLoaderCentos8 - } else if (systemName.contains("Anolis") && systemVersion.startsWith("7")) { - new SharedLibraryLoaderCentos7 - } else if (system.contains("tencentos") && system.contains("2.4")) { - new SharedLibraryLoaderCentos7 - } else if (system.contains("tencentos") && system.contains("3.2")) { - new SharedLibraryLoaderCentos8 - } else if (systemName.contains("Red Hat") && systemVersion.startsWith("9")) { - new SharedLibraryLoaderCentos9 - } else if (systemName.contains("Red Hat") && systemVersion.startsWith("8")) { - new SharedLibraryLoaderCentos8 - } else if (systemName.contains("Red Hat") && systemVersion.startsWith("7")) { - new SharedLibraryLoaderCentos7 - } else if (systemName.contains("Debian") && systemVersion.startsWith("11")) { - new SharedLibraryLoaderDebian11 - } else if (systemName.contains("Debian") && systemVersion.startsWith("12")) { - new SharedLibraryLoaderDebian12 - } else { - throw new GlutenException( - s"Found unsupported OS($systemName, $systemVersion)! Currently, Gluten's Velox backend" + - " only supports Ubuntu 20.04/22.04, CentOS 7/8, " + - "Alibaba Cloud Linux 2/3 & Anolis 7/8, tencentos 2.4/3.2, RedHat 7/8, " + - "Debian 11/12.") + // Static initializers for executor. + if (!executorInitialized.compareAndSet(false, true)) { + // Make sure we call the static initializers only once. + logInfo( + "Skip rerunning static initializers since they are only supposed to run once." 
+ + " You see this message probably because you are creating a new SparkSession.") + return } - } - - private def loadLibFromJar(load: JniLibLoader, conf: SparkConf): Unit = { - val systemName = conf.getOption(GlutenConfig.GLUTEN_LOAD_LIB_OS) - val loader = if (systemName.isDefined) { - val systemVersion = conf.getOption(GlutenConfig.GLUTEN_LOAD_LIB_OS_VERSION) - if (systemVersion.isEmpty) { - throw new GlutenException( - s"${GlutenConfig.GLUTEN_LOAD_LIB_OS_VERSION} must be specified when specifies the " + - s"${GlutenConfig.GLUTEN_LOAD_LIB_OS}") - } - getLibraryLoaderForOS(systemName.get, systemVersion.get, "") - } else { - val system = "cat /etc/os-release".!! - val systemNamePattern = "^NAME=\"?(.*)\"?".r - val systemVersionPattern = "^VERSION=\"?(.*)\"?".r - val systemInfoLines = system.stripMargin.split("\n") - val systemNamePattern(systemName) = - systemInfoLines.find(_.startsWith("NAME=")).getOrElse("") - val systemVersionPattern(systemVersion) = - systemInfoLines.find(_.startsWith("VERSION=")).getOrElse("") - if (systemName.isEmpty || systemVersion.isEmpty) { - throw new GlutenException("Failed to get OS name and version info.") - } - getLibraryLoaderForOS(systemName, systemVersion, system) + if (inLocalMode(conf)) { + // Don't do static initializations from executor side in local mode. + // Driver already did that. + logInfo( + "Gluten is running with Spark local mode. Skip running static initializer for executor.") + return } - loader.loadLib(load) - } - private def loadLibWithLinux(conf: SparkConf, loader: JniLibLoader): Unit = { - if ( - conf.getBoolean( - GlutenConfig.GLUTEN_LOAD_LIB_FROM_JAR, - GlutenConfig.GLUTEN_LOAD_LIB_FROM_JAR_DEFAULT) - ) { - loadLibFromJar(loader, conf) - } + SparkDirectoryUtil.init(conf) + UDFResolver.resolveUdfConf(conf, isDriver = false) + initialize(conf) } - private def loadLibWithMacOS(loader: JniLibLoader): Unit = { - // Placeholder for loading shared libs on MacOS if user needs. - } + override def onExecutorShutdown(): Unit = shutdown() - private def initialize(conf: SparkConf, isDriver: Boolean): Unit = { - SparkDirectoryUtil.init(conf) - UDFResolver.resolveUdfConf(conf, isDriver = isDriver) + private def initialize(conf: SparkConf): Unit = { if (conf.getBoolean(GlutenConfig.GLUTEN_DEBUG_KEEP_JNI_WORKSPACE, defaultValue = false)) { val debugDir = conf.get(GlutenConfig.GLUTEN_DEBUG_KEEP_JNI_WORKSPACE_DIR) JniWorkspace.enableDebug(debugDir) } - val loader = JniWorkspace.getDefault.libLoader - - val osName = System.getProperty("os.name") - if (osName.startsWith("Mac OS X") || osName.startsWith("macOS")) { - loadLibWithMacOS(loader) - } else { - loadLibWithLinux(conf, loader) - } // Set the system properties. // Use appending policy for children with the same name in a arrow struct vector. @@ -167,6 +104,13 @@ class VeloxListenerApi extends ListenerApi { // Load supported hive/python/scala udfs UDFMappings.loadFromSparkConf(conf) + // Initial library loader. + val loader = JniWorkspace.getDefault.libLoader + + // Load shared native libraries the backend libraries depend on. + SharedLibraryLoader.load(conf, loader) + + // Load backend libraries. val libPath = conf.get(GlutenConfig.GLUTEN_LIB_PATH, StringUtils.EMPTY) if (StringUtils.isNotBlank(libPath)) { // Path based load. Ignore all other loadees. JniLibLoader.loadFromPath(libPath, false) @@ -176,11 +120,11 @@ class VeloxListenerApi extends ListenerApi { loader.mapAndLoad(VeloxBackend.BACKEND_NAME, false) } + // Initial native backend with configurations. 
val parsed = GlutenConfigUtil.parseConfig(conf.getAll.toMap) NativeBackendInitializer.initializeBackend(parsed) - // inject backend-specific implementations to override spark classes - // FIXME: The following set instances twice in local mode? + // Inject backend-specific implementations to override spark classes. GlutenParquetWriterInjects.setInstance(new VeloxParquetWriterInjects()) GlutenOrcWriterInjects.setInstance(new VeloxOrcWriterInjects()) GlutenRowSplitter.setInstance(new VeloxRowSplitter()) @@ -191,4 +135,13 @@ class VeloxListenerApi extends ListenerApi { } } -object VeloxListenerApi {} +object VeloxListenerApi { + // TODO: Implement graceful shutdown and remove these flags. + // As spark conf may change when active Spark session is recreated. + private val driverInitialized: AtomicBoolean = new AtomicBoolean(false) + private val executorInitialized: AtomicBoolean = new AtomicBoolean(false) + + private def inLocalMode(conf: SparkConf): Boolean = { + SparkResourceUtil.isLocalMaster(conf) + } +} diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala new file mode 100644 index 0000000000000..f152da8858872 --- /dev/null +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxRuleApi.scala @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.backendsapi.velox + +import org.apache.gluten.backendsapi.RuleApi +import org.apache.gluten.datasource.ArrowConvertorRule +import org.apache.gluten.extension.{ArrowScanReplaceRule, BloomFilterMightContainJointRewriteRule, CollectRewriteRule, FlushableHashAggregateRule, HLLRewriteRule} +import org.apache.gluten.extension.EmptySchemaWorkaround.{FallbackEmptySchemaRelation, PlanOneRowRelation} +import org.apache.gluten.extension.columnar._ +import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow, RewriteSubqueryBroadcast, TransformPreOverrides} +import org.apache.gluten.extension.columnar.enumerated.EnumeratedTransform +import org.apache.gluten.extension.columnar.rewrite.RewriteSparkPlanRulesManager +import org.apache.gluten.extension.columnar.transition.{InsertTransitions, RemoveTransitions} +import org.apache.gluten.extension.injector.{RuleInjector, SparkInjector} +import org.apache.gluten.extension.injector.GlutenInjector.{LegacyInjector, RasInjector} +import org.apache.gluten.sql.shims.SparkShimLoader + +import org.apache.spark.sql.execution.{ColumnarCollapseTransformStages, GlutenFallbackReporter} +import org.apache.spark.sql.expression.UDFResolver +import org.apache.spark.util.SparkPlanRules + +class VeloxRuleApi extends RuleApi { + import VeloxRuleApi._ + + override def injectRules(injector: RuleInjector): Unit = { + injectSpark(injector.spark) + injectLegacy(injector.gluten.legacy) + injectRas(injector.gluten.ras) + } +} + +private object VeloxRuleApi { + def injectSpark(injector: SparkInjector): Unit = { + // Regular Spark rules. + injector.injectOptimizerRule(CollectRewriteRule.apply) + injector.injectOptimizerRule(HLLRewriteRule.apply) + UDFResolver.getFunctionSignatures.foreach(injector.injectFunction) + injector.injectPostHocResolutionRule(ArrowConvertorRule.apply) + } + + def injectLegacy(injector: LegacyInjector): Unit = { + // Gluten columnar: Transform rules. + injector.injectTransform(_ => RemoveTransitions) + injector.injectTransform(c => FallbackOnANSIMode.apply(c.session)) + injector.injectTransform(c => FallbackMultiCodegens.apply(c.session)) + injector.injectTransform(c => PlanOneRowRelation.apply(c.session)) + injector.injectTransform(_ => RewriteSubqueryBroadcast()) + injector.injectTransform(c => BloomFilterMightContainJointRewriteRule.apply(c.session)) + injector.injectTransform(c => ArrowScanReplaceRule.apply(c.session)) + injector.injectTransform(_ => FallbackEmptySchemaRelation()) + injector.injectTransform(_ => RewriteSparkPlanRulesManager()) + injector.injectTransform(_ => AddFallbackTagRule()) + injector.injectTransform(_ => TransformPreOverrides()) + injector.injectTransform(_ => RemoveNativeWriteFilesSortAndProject()) + injector.injectTransform(c => RewriteTransformer.apply(c.session)) + injector.injectTransform(_ => EnsureLocalSortRequirements) + injector.injectTransform(_ => EliminateLocalSort) + injector.injectTransform(_ => CollapseProjectExecTransformer) + injector.injectTransform(c => FlushableHashAggregateRule.apply(c.session)) + injector.injectTransform( + c => SparkPlanRules.extendedColumnarRule(c.conf.extendedColumnarTransformRules)(c.session)) + injector.injectTransform(c => InsertTransitions(c.outputsColumnar)) + + // Gluten columnar: Fallback policies. + injector.injectFallbackPolicy( + c => ExpandFallbackPolicy(c.ac.isAdaptiveContext(), c.ac.originalPlan())) + + // Gluten columnar: Post rules. 
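// Editor's aside (illustrative only, not part of the patch): each inject* call in this
// file registers a factory from the rule-building context `c` (exposing c.session,
// c.conf, c.ac, c.outputsColumnar) to a physical-plan rule, so the backend assembles
// its columnar pipeline declaratively instead of overriding per-rule hooks. A
// hypothetical extra rule would be registered the same way, e.g.:
//
//   injector.injectTransform(c => MyCustomColumnarRule(c.session)) // hypothetical rule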
+ injector.injectPost(c => RemoveTopmostColumnarToRow(c.session, c.ac.isAdaptiveContext())) + SparkShimLoader.getSparkShims + .getExtendedColumnarPostRules() + .foreach(each => injector.injectPost(c => each(c.session))) + injector.injectPost(c => ColumnarCollapseTransformStages(c.conf)) + injector.injectTransform( + c => SparkPlanRules.extendedColumnarRule(c.conf.extendedColumnarPostRules)(c.session)) + + // Gluten columnar: Final rules. + injector.injectFinal(c => RemoveGlutenTableCacheColumnarToRow(c.session)) + injector.injectFinal(c => GlutenFallbackReporter(c.conf, c.session)) + injector.injectFinal(_ => RemoveFallbackTagRule()) + } + + def injectRas(injector: RasInjector): Unit = { + // Gluten RAS: Pre rules. + injector.inject(_ => RemoveTransitions) + injector.inject(c => FallbackOnANSIMode.apply(c.session)) + injector.inject(c => PlanOneRowRelation.apply(c.session)) + injector.inject(_ => FallbackEmptySchemaRelation()) + injector.inject(_ => RewriteSubqueryBroadcast()) + injector.inject(c => BloomFilterMightContainJointRewriteRule.apply(c.session)) + injector.inject(c => ArrowScanReplaceRule.apply(c.session)) + + // Gluten RAS: The RAS rule. + injector.inject(c => EnumeratedTransform(c.session, c.outputsColumnar)) + + // Gluten RAS: Post rules. + injector.inject(_ => RemoveTransitions) + injector.inject(_ => RemoveNativeWriteFilesSortAndProject()) + injector.inject(c => RewriteTransformer.apply(c.session)) + injector.inject(_ => EnsureLocalSortRequirements) + injector.inject(_ => EliminateLocalSort) + injector.inject(_ => CollapseProjectExecTransformer) + injector.inject(c => FlushableHashAggregateRule.apply(c.session)) + injector.inject( + c => SparkPlanRules.extendedColumnarRule(c.conf.extendedColumnarTransformRules)(c.session)) + injector.inject(c => InsertTransitions(c.outputsColumnar)) + injector.inject(c => RemoveTopmostColumnarToRow(c.session, c.ac.isAdaptiveContext())) + SparkShimLoader.getSparkShims + .getExtendedColumnarPostRules() + .foreach(each => injector.inject(c => each(c.session))) + injector.inject(c => ColumnarCollapseTransformStages(c.conf)) + injector.inject( + c => SparkPlanRules.extendedColumnarRule(c.conf.extendedColumnarPostRules)(c.session)) + injector.inject(c => RemoveGlutenTableCacheColumnarToRow(c.session)) + injector.inject(c => GlutenFallbackReporter(c.conf, c.session)) + injector.inject(_ => RemoveFallbackTagRule()) + } +} diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxSparkPlanExecApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxSparkPlanExecApi.scala index 9e0d85e25a362..bd390004fedad 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxSparkPlanExecApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxSparkPlanExecApi.scala @@ -18,12 +18,10 @@ package org.apache.gluten.backendsapi.velox import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.SparkPlanExecApi -import org.apache.gluten.datasource.ArrowConvertorRule import org.apache.gluten.exception.GlutenNotSupportException import org.apache.gluten.execution._ import org.apache.gluten.expression._ import org.apache.gluten.expression.aggregate.{HLLAdapter, VeloxBloomFilterAggregate, VeloxCollectList, VeloxCollectSet} -import org.apache.gluten.extension._ import org.apache.gluten.extension.columnar.FallbackTags import org.apache.gluten.extension.columnar.transition.Convention import 
org.apache.gluten.extension.columnar.transition.ConventionFunc.BatchOverride @@ -36,25 +34,22 @@ import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.{GenShuffleWriterParameters, GlutenShuffleWriterWrapper} import org.apache.spark.shuffle.utils.ShuffleUtil -import org.apache.spark.sql.{SparkSession, Strategy} -import org.apache.spark.sql.catalyst.FunctionIdentifier -import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.optimizer.BuildSide import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical._ -import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution._ import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec -import org.apache.spark.sql.execution.datasources.WriteJobDescription +import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchangeExec} import org.apache.spark.sql.execution.joins.{BuildSideRelation, HashedRelationBroadcastMode} import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.execution.python.ArrowEvalPythonExec import org.apache.spark.sql.execution.utils.ExecUtil -import org.apache.spark.sql.expression.{UDFExpression, UDFResolver, UserDefinedAggregateFunction} +import org.apache.spark.sql.expression.{UDFExpression, UserDefinedAggregateFunction} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarBatch @@ -63,8 +58,6 @@ import org.apache.commons.lang3.ClassUtils import javax.ws.rs.core.UriBuilder -import scala.collection.mutable.ListBuffer - class VeloxSparkPlanExecApi extends SparkPlanExecApi { /** The columnar-batch type this backend is using. */ @@ -546,6 +539,17 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { } // scalastyle:on argcount + /** Determine whether to use sort-based shuffle based on shuffle partitioning and output. */ + override def useSortBasedShuffle(partitioning: Partitioning, output: Seq[Attribute]): Boolean = { + val conf = GlutenConfig.getConf + lazy val isCelebornSortBasedShuffle = conf.isUseCelebornShuffleManager && + conf.celebornShuffleWriterType == GlutenConfig.GLUTEN_SORT_SHUFFLE_WRITER + partitioning != SinglePartition && + (partitioning.numPartitions >= GlutenConfig.getConf.columnarShuffleSortPartitionsThreshold || + output.size >= GlutenConfig.getConf.columnarShuffleSortColumnsThreshold) || + isCelebornSortBasedShuffle + } + /** * Generate ColumnarShuffleWriter for ColumnarShuffleManager. 
* @@ -555,9 +559,22 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { parameters: GenShuffleWriterParameters[K, V]): GlutenShuffleWriterWrapper[K, V] = { ShuffleUtil.genColumnarShuffleWriter(parameters) } - - override def createBackendWrite(description: WriteJobDescription): BackendWrite = { - VeloxBackendWrite(description) + override def createColumnarWriteFilesExec( + child: SparkPlan, + noop: SparkPlan, + fileFormat: FileFormat, + partitionColumns: Seq[Attribute], + bucketSpec: Option[BucketSpec], + options: Map[String, String], + staticPartitions: TablePartitionSpec): ColumnarWriteFilesExec = { + VeloxColumnarWriteFilesExec( + child, + noop, + fileFormat, + partitionColumns, + bucketSpec, + options, + staticPartitions) } override def createColumnarArrowEvalPythonExec( @@ -635,17 +652,6 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { * * Expressions. */ - /** Generate StringSplit transformer. */ - override def genStringSplitTransformer( - substraitExprName: String, - srcExpr: ExpressionTransformer, - regexExpr: ExpressionTransformer, - limitExpr: ExpressionTransformer, - original: StringSplit): ExpressionTransformer = { - // In velox, split function just support tow args, not support limit arg for now - VeloxStringSplitTransformer(substraitExprName, srcExpr, regexExpr, limitExpr, original) - } - /** * Generate Alias transformer. * @@ -745,74 +751,6 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { } } - /** - * * Rules and strategies. - */ - - /** - * Generate extended DataSourceV2 Strategy. - * - * @return - */ - override def genExtendedDataSourceV2Strategies(): List[SparkSession => Strategy] = List() - - /** - * Generate extended query stage preparation rules. - * - * @return - */ - override def genExtendedQueryStagePrepRules(): List[SparkSession => Rule[SparkPlan]] = List() - - /** - * Generate extended Analyzer. - * - * @return - */ - override def genExtendedAnalyzers(): List[SparkSession => Rule[LogicalPlan]] = List() - - /** - * Generate extended Optimizer. Currently only for Velox backend. - * - * @return - */ - override def genExtendedOptimizers(): List[SparkSession => Rule[LogicalPlan]] = - List(CollectRewriteRule.apply, HLLRewriteRule.apply) - - /** - * Generate extended columnar pre-rules, in the validation phase. - * - * @return - */ - override def genExtendedColumnarValidationRules(): List[SparkSession => Rule[SparkPlan]] = { - List(BloomFilterMightContainJointRewriteRule.apply, ArrowScanReplaceRule.apply) - } - - /** - * Generate extended columnar pre-rules. - * - * @return - */ - override def genExtendedColumnarTransformRules(): List[SparkSession => Rule[SparkPlan]] = { - val buf: ListBuffer[SparkSession => Rule[SparkPlan]] = ListBuffer() - if (GlutenConfig.getConf.enableVeloxFlushablePartialAggregation) { - buf += FlushableHashAggregateRule.apply - } - buf.result - } - - override def genInjectPostHocResolutionRules(): List[SparkSession => Rule[LogicalPlan]] = { - List(ArrowConvertorRule) - } - - /** - * Generate extended Strategy. - * - * @return - */ - override def genExtendedStrategies(): List[SparkSession => Strategy] = { - List() - } - /** Define backend specfic expression mappings. 
*/ override def extraExpressionMappings: Seq[Sig] = { Seq( @@ -829,11 +767,6 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi { ) } - override def genInjectedFunctions() - : Seq[(FunctionIdentifier, ExpressionInfo, FunctionBuilder)] = { - UDFResolver.getFunctionSignatures - } - override def rewriteSpillPath(path: String): String = { val fs = GlutenConfig.getConf.veloxSpillFileSystem fs match { diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala index 4cbde635e9bda..37d8acc7f2b7c 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala @@ -17,8 +17,8 @@ package org.apache.gluten.backendsapi.velox import org.apache.gluten.backendsapi.TransformerApi -import org.apache.gluten.exec.Runtimes import org.apache.gluten.expression.ConverterUtils +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.substrait.expression.{ExpressionBuilder, ExpressionNode} import org.apache.gluten.utils.InputPartitionsUtil import org.apache.gluten.vectorized.PlanEvaluatorJniWrapper diff --git a/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVFileFormat.scala b/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVFileFormat.scala index a8e65b0539c7c..5629811f4d226 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVFileFormat.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/datasource/ArrowCSVFileFormat.scala @@ -34,7 +34,6 @@ import org.apache.spark.sql.catalyst.expressions.{AttributeReference, JoinedRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFileLinesReader, OutputWriterFactory, PartitionedFile} import org.apache.spark.sql.execution.datasources.csv.CSVDataSource -import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.{DataSourceRegister, Filter} import org.apache.spark.sql.types.{StructField, StructType} @@ -310,16 +309,9 @@ object ArrowCSVFileFormat { schema: StructType, batchSize: Int, it: Iterator[InternalRow]): Iterator[ColumnarBatch] = { - // note, these metrics are unused but just make `RowToVeloxColumnarExec` happy - val numInputRows = new SQLMetric("numInputRows") - val numOutputBatches = new SQLMetric("numOutputBatches") - val convertTime = new SQLMetric("convertTime") val veloxBatch = RowToVeloxColumnarExec.toColumnarBatchIterator( it, schema, - numInputRows, - numOutputBatches, - convertTime, batchSize ) veloxBatch diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala index c7b81d55fa067..be76ba54ed725 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/GenerateExecTransformer.scala @@ -228,7 +228,7 @@ object PullOutGenerateProjectHelper extends PullOutProjectHelper { } } - newProjections += Alias(CreateArray(fieldArray), generatePreAliasName)() + newProjections += Alias(CreateArray(fieldArray.toSeq), generatePreAliasName)() } // Plug in a Project between Generate and its child. 
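Editor's note on the .toSeq conversions added in the hunks above (for example CreateArray(fieldArray.toSeq) in GenerateExecTransformer, and similar conversions in the following hunks): these are the usual Scala 2.13 cross-build fix. In Scala 2.13, scala.Seq aliases scala.collection.immutable.Seq, so a mutable buffer no longer satisfies a Seq parameter implicitly. The snippet below is a minimal, standalone illustration and is not part of the patch; all names in it are hypothetical.

import scala.collection.mutable.ArrayBuffer

// Minimal illustration of why the patch adds .toSeq when passing mutable buffers around.
object ToSeqSketch {
  // Under Scala 2.13, scala.Seq is scala.collection.immutable.Seq.
  def describe(fields: Seq[String]): String = fields.mkString(", ")

  def main(args: Array[String]): Unit = {
    val buffer = ArrayBuffer("a", "b", "c")
    // describe(buffer) compiles on 2.12 (where Seq is collection.Seq) but fails on 2.13;
    // converting with .toSeq keeps the call source-compatible with both versions.
    println(describe(buffer.toSeq))
  }
}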
diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/HashAggregateExecTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/HashAggregateExecTransformer.scala index 4f33ae7c718ca..9c5b68e7bff1b 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/HashAggregateExecTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/HashAggregateExecTransformer.scala @@ -396,7 +396,8 @@ abstract class HashAggregateExecTransformer( childNodes.add(expressionNode) } } - exprNodes.add(getRowConstructNode(args, childNodes, newInputAttributes, aggFunc)) + exprNodes.add( + getRowConstructNode(args, childNodes, newInputAttributes.toSeq, aggFunc)) case other => throw new GlutenNotSupportException(s"$other is not supported.") } diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/RowToVeloxColumnarExec.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/RowToVeloxColumnarExec.scala index 2f3e88f9af9cb..aa30cc80d4db6 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/RowToVeloxColumnarExec.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/RowToVeloxColumnarExec.scala @@ -18,8 +18,8 @@ package org.apache.gluten.execution import org.apache.gluten.GlutenConfig import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.utils.ArrowAbiUtil import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized._ @@ -92,6 +92,23 @@ case class RowToVeloxColumnarExec(child: SparkPlan) extends RowToColumnarExecBas } object RowToVeloxColumnarExec { + + def toColumnarBatchIterator( + it: Iterator[InternalRow], + schema: StructType, + columnBatchSize: Int): Iterator[ColumnarBatch] = { + val numInputRows = new SQLMetric("numInputRows") + val numOutputBatches = new SQLMetric("numOutputBatches") + val convertTime = new SQLMetric("convertTime") + RowToVeloxColumnarExec.toColumnarBatchIterator( + it, + schema, + numInputRows, + numOutputBatches, + convertTime, + columnBatchSize) + } + def toColumnarBatchIterator( it: Iterator[InternalRow], schema: StructType, diff --git a/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala index 993a888b91df2..4bd553b012351 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/execution/VeloxColumnarToRowExec.scala @@ -18,8 +18,8 @@ package org.apache.gluten.execution import org.apache.gluten.columnarbatch.ColumnarBatches import org.apache.gluten.exception.GlutenNotSupportException -import org.apache.gluten.exec.Runtimes import org.apache.gluten.extension.ValidationResult +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.NativeColumnarToRowJniWrapper @@ -99,6 +99,22 @@ case class VeloxColumnarToRowExec(child: SparkPlan) extends ColumnarToRowExecBas } object VeloxColumnarToRowExec { + + def toRowIterator( + batches: Iterator[ColumnarBatch], + output: Seq[Attribute]): Iterator[InternalRow] = { + val numOutputRows = new SQLMetric("numOutputRows") + val numInputBatches = new SQLMetric("numInputBatches") + val convertTime = new SQLMetric("convertTime") + toRowIterator( + 
batches, + output, + numOutputRows, + numInputBatches, + convertTime + ) + } + def toRowIterator( batches: Iterator[ColumnarBatch], output: Seq[Attribute], diff --git a/backends-velox/src/main/scala/org/apache/gluten/expression/ExpressionTransformer.scala b/backends-velox/src/main/scala/org/apache/gluten/expression/ExpressionTransformer.scala index 51b19ab140d9a..71e58f124fc88 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/expression/ExpressionTransformer.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/expression/ExpressionTransformer.scala @@ -107,33 +107,3 @@ case class VeloxHashExpressionTransformer( ExpressionBuilder.makeScalarFunction(functionId, nodes, typeNode) } } - -case class VeloxStringSplitTransformer( - substraitExprName: String, - srcExpr: ExpressionTransformer, - regexExpr: ExpressionTransformer, - limitExpr: ExpressionTransformer, - original: StringSplit) - extends ExpressionTransformer { - // TODO: split function support limit arg - override def children: Seq[ExpressionTransformer] = srcExpr :: regexExpr :: Nil - - override def doTransform(args: java.lang.Object): ExpressionNode = { - if ( - !regexExpr.isInstanceOf[LiteralTransformer] || - !limitExpr.isInstanceOf[LiteralTransformer] - ) { - throw new GlutenNotSupportException( - "Gluten only supports literal input as limit/regex for split function.") - } - - val limit = limitExpr.doTransform(args).asInstanceOf[IntLiteralNode].getValue - val regex = regexExpr.doTransform(args).asInstanceOf[StringLiteralNode].getValue - if (limit > 0 || regex.length > 1) { - throw new GlutenNotSupportException( - s"$original supported single-length regex and negative limit, but given $limit and $regex") - } - - super.doTransform(args) - } -} diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/EmptySchemaWorkaround.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/EmptySchemaWorkaround.scala new file mode 100644 index 0000000000000..3f34e7fc262d6 --- /dev/null +++ b/backends-velox/src/main/scala/org/apache/gluten/extension/EmptySchemaWorkaround.scala @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.extension + +import org.apache.gluten.GlutenConfig +import org.apache.gluten.extension.columnar.FallbackTags + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, EulerNumber, Expression, Literal, MakeYMInterval, Pi, Rand, SparkPartitionID, SparkVersion, Uuid} +import org.apache.spark.sql.catalyst.expressions.aggregate.{Count, Sum} +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.{ProjectExec, RDDScanExec, SparkPlan, UnaryExecNode} +import org.apache.spark.sql.execution.aggregate.HashAggregateExec +import org.apache.spark.sql.execution.datasources.WriteFilesExec +import org.apache.spark.sql.types.StringType + +/** Rules to make Velox backend work correctly with query plans that have empty output schemas. */ +object EmptySchemaWorkaround { + + /** + * This rule plans [[RDDScanExec]] with a fake schema to make gluten work, because gluten does not + * support empty output relation, see [[FallbackEmptySchemaRelation]]. + */ + case class PlanOneRowRelation(spark: SparkSession) extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = { + if (!GlutenConfig.getConf.enableOneRowRelationColumnar) { + return plan + } + + plan.transform { + // We should make sure the output does not change, e.g. + // Window + // OneRowRelation + case u: UnaryExecNode + if u.child.isInstanceOf[RDDScanExec] && + u.child.asInstanceOf[RDDScanExec].name == "OneRowRelation" && + u.outputSet != u.child.outputSet => + val rdd = spark.sparkContext.parallelize(InternalRow(null) :: Nil, 1) + val attr = AttributeReference("fake_column", StringType)() + u.withNewChildren(RDDScanExec(attr :: Nil, rdd, "OneRowRelation") :: Nil) + } + } + } + + /** + * FIXME To be removed: Since Velox backend is the only one to use the strategy, and we already + * support offloading zero-column batch in ColumnarBatchInIterator via PR #3309. + * + * We'd make sure all Velox operators be able to handle zero-column input correctly then remove + * the rule together with [[PlanOneRowRelation]]. + */ + case class FallbackEmptySchemaRelation() extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = plan.transformDown { + case p => + if (fallbackOnEmptySchema(p)) { + if (p.children.exists(_.output.isEmpty)) { + // Some backends are not eligible to offload plan with zero-column input. + // If any child have empty output, mark the plan and that child as UNSUPPORTED. + FallbackTags.add(p, "at least one of its children has empty output") + p.children.foreach { + child => + if (child.output.isEmpty && !child.isInstanceOf[WriteFilesExec]) { + FallbackTags.add(child, "at least one of its children has empty output") + } + } + } + } + p + } + + private def fallbackOnEmptySchema(plan: SparkPlan): Boolean = { + // Count(1) and Sum(1) are special cases that Velox backend can handle. + // Do not fallback it and its children in the first place. + !mayNeedOffload(plan) + } + + /** + * Check whether a plan needs to be offloaded even though they have empty input schema, e.g, + * Sum(1), Count(1), rand(), etc. + * @param plan: + * The Spark plan to check. + * + * Since https://github.com/apache/incubator-gluten/pull/2749. + */ + private def mayNeedOffload(plan: SparkPlan): Boolean = { + def checkExpr(expr: Expression): Boolean = { + expr match { + // Block directly falling back the below functions by FallbackEmptySchemaRelation. 
+ case alias: Alias => checkExpr(alias.child) + case _: Rand | _: Uuid | _: MakeYMInterval | _: SparkPartitionID | _: EulerNumber | + _: Pi | _: SparkVersion => + true + case _ => false + } + } + + plan match { + case exec: HashAggregateExec if exec.aggregateExpressions.nonEmpty => + // Check Sum(Literal) or Count(Literal). + exec.aggregateExpressions.forall( + expression => { + val aggFunction = expression.aggregateFunction + aggFunction match { + case Sum(Literal(_, _), _) => true + case Count(Seq(Literal(_, _))) => true + case _ => false + } + }) + case p: ProjectExec if p.projectList.nonEmpty => + p.projectList.forall(checkExpr(_)) + case _ => + false + } + } + } +} diff --git a/backends-velox/src/main/scala/org/apache/gluten/extension/FlushableHashAggregateRule.scala b/backends-velox/src/main/scala/org/apache/gluten/extension/FlushableHashAggregateRule.scala index 3137d6e6aef54..04bdbe1efb512 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/extension/FlushableHashAggregateRule.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/extension/FlushableHashAggregateRule.scala @@ -16,6 +16,7 @@ */ package org.apache.gluten.extension +import org.apache.gluten.GlutenConfig import org.apache.gluten.execution._ import org.apache.spark.sql.SparkSession @@ -31,27 +32,32 @@ import org.apache.spark.sql.execution.exchange.ShuffleExchangeLike */ case class FlushableHashAggregateRule(session: SparkSession) extends Rule[SparkPlan] { import FlushableHashAggregateRule._ - override def apply(plan: SparkPlan): SparkPlan = plan.transformUp { - case s: ShuffleExchangeLike => - // If an exchange follows a hash aggregate in which all functions are in partial mode, - // then it's safe to convert the hash aggregate to flushable hash aggregate. - val out = s.withNewChildren( - List( - replaceEligibleAggregates(s.child) { - agg => - FlushableHashAggregateExecTransformer( - agg.requiredChildDistributionExpressions, - agg.groupingExpressions, - agg.aggregateExpressions, - agg.aggregateAttributes, - agg.initialInputBufferOffset, - agg.resultExpressions, - agg.child - ) - } + override def apply(plan: SparkPlan): SparkPlan = { + if (!GlutenConfig.getConf.enableVeloxFlushablePartialAggregation) { + return plan + } + plan.transformUp { + case s: ShuffleExchangeLike => + // If an exchange follows a hash aggregate in which all functions are in partial mode, + // then it's safe to convert the hash aggregate to flushable hash aggregate. 
+ val out = s.withNewChildren( + List( + replaceEligibleAggregates(s.child) { + agg => + FlushableHashAggregateExecTransformer( + agg.requiredChildDistributionExpressions, + agg.groupingExpressions, + agg.aggregateExpressions, + agg.aggregateAttributes, + agg.initialInputBufferOffset, + agg.resultExpressions, + agg.child + ) + } + ) ) - ) - out + out + } } private def replaceEligibleAggregates(plan: SparkPlan)( diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/DatasourceUtil.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/DatasourceUtil.scala index 3d7725655bb12..8963ce93c1f69 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/DatasourceUtil.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/DatasourceUtil.scala @@ -17,8 +17,8 @@ package org.apache.gluten.utils import org.apache.gluten.datasource.DatasourceJniWrapper -import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators +import org.apache.gluten.runtime.Runtimes import org.apache.spark.sql.types.StructType import org.apache.spark.sql.utils.SparkSchemaUtil diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoader.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoader.scala index 137da83c09800..1f3ca30de9f59 100755 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoader.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoader.scala @@ -16,8 +16,112 @@ */ package org.apache.gluten.utils +import org.apache.gluten.GlutenConfig +import org.apache.gluten.exception.GlutenException import org.apache.gluten.vectorized.JniLibLoader +import org.apache.spark.SparkConf + +import scala.sys.process._ + trait SharedLibraryLoader { def loadLib(loader: JniLibLoader): Unit } + +object SharedLibraryLoader { + def load(conf: SparkConf, jni: JniLibLoader): Unit = { + val shouldLoad = conf.getBoolean( + GlutenConfig.GLUTEN_LOAD_LIB_FROM_JAR, + GlutenConfig.GLUTEN_LOAD_LIB_FROM_JAR_DEFAULT) + if (!shouldLoad) { + return + } + val osName = System.getProperty("os.name") + if (osName.startsWith("Mac OS X") || osName.startsWith("macOS")) { + loadLibWithMacOS(jni) + } else { + loadLibWithLinux(conf, jni) + } + } + + private def loadLibWithLinux(conf: SparkConf, jni: JniLibLoader): Unit = { + val loader = find(conf) + loader.loadLib(jni) + } + + private def loadLibWithMacOS(jni: JniLibLoader): Unit = { + // Placeholder for loading shared libs on MacOS if user needs. + } + + private def find(conf: SparkConf): SharedLibraryLoader = { + val systemName = conf.getOption(GlutenConfig.GLUTEN_LOAD_LIB_OS) + val loader = if (systemName.isDefined) { + val systemVersion = conf.getOption(GlutenConfig.GLUTEN_LOAD_LIB_OS_VERSION) + if (systemVersion.isEmpty) { + throw new GlutenException( + s"${GlutenConfig.GLUTEN_LOAD_LIB_OS_VERSION} must be specified when specifies the " + + s"${GlutenConfig.GLUTEN_LOAD_LIB_OS}") + } + getForOS(systemName.get, systemVersion.get, "") + } else { + val system = "cat /etc/os-release".!! 
+ val systemNamePattern = "^NAME=\"?(.*)\"?".r + val systemVersionPattern = "^VERSION=\"?(.*)\"?".r + val systemInfoLines = system.stripMargin.split("\n") + val systemNamePattern(systemName) = + systemInfoLines.find(_.startsWith("NAME=")).getOrElse("") + val systemVersionPattern(systemVersion) = + systemInfoLines.find(_.startsWith("VERSION=")).getOrElse("") + if (systemName.isEmpty || systemVersion.isEmpty) { + throw new GlutenException("Failed to get OS name and version info.") + } + getForOS(systemName, systemVersion, system) + } + loader + } + + private def getForOS( + systemName: String, + systemVersion: String, + system: String): SharedLibraryLoader = { + if (systemName.contains("Ubuntu") && systemVersion.startsWith("20.04")) { + new SharedLibraryLoaderUbuntu2004 + } else if (systemName.contains("Ubuntu") && systemVersion.startsWith("22.04")) { + new SharedLibraryLoaderUbuntu2204 + } else if (systemName.contains("CentOS") && systemVersion.startsWith("9")) { + new SharedLibraryLoaderCentos9 + } else if (systemName.contains("CentOS") && systemVersion.startsWith("8")) { + new SharedLibraryLoaderCentos8 + } else if (systemName.contains("CentOS") && systemVersion.startsWith("7")) { + new SharedLibraryLoaderCentos7 + } else if (systemName.contains("Alibaba Cloud Linux") && systemVersion.startsWith("3")) { + new SharedLibraryLoaderCentos8 + } else if (systemName.contains("Alibaba Cloud Linux") && systemVersion.startsWith("2")) { + new SharedLibraryLoaderCentos7 + } else if (systemName.contains("Anolis") && systemVersion.startsWith("8")) { + new SharedLibraryLoaderCentos8 + } else if (systemName.contains("Anolis") && systemVersion.startsWith("7")) { + new SharedLibraryLoaderCentos7 + } else if (system.contains("tencentos") && system.contains("2.4")) { + new SharedLibraryLoaderCentos7 + } else if (system.contains("tencentos") && system.contains("3.2")) { + new SharedLibraryLoaderCentos8 + } else if (systemName.contains("Red Hat") && systemVersion.startsWith("9")) { + new SharedLibraryLoaderCentos9 + } else if (systemName.contains("Red Hat") && systemVersion.startsWith("8")) { + new SharedLibraryLoaderCentos8 + } else if (systemName.contains("Red Hat") && systemVersion.startsWith("7")) { + new SharedLibraryLoaderCentos7 + } else if (systemName.contains("Debian") && systemVersion.startsWith("11")) { + new SharedLibraryLoaderDebian11 + } else if (systemName.contains("Debian") && systemVersion.startsWith("12")) { + new SharedLibraryLoaderDebian12 + } else { + throw new GlutenException( + s"Found unsupported OS($systemName, $systemVersion)! 
Currently, Gluten's Velox backend" + + " only supports Ubuntu 20.04/22.04, CentOS 7/8, " + + "Alibaba Cloud Linux 2/3 & Anolis 7/8, tencentos 2.4/3.2, RedHat 7/8, " + + "Debian 11/12.") + } + } +} diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos7.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos7.scala index 47ed2c47cbb5d..a7750dcb3e65b 100755 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos7.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos7.scala @@ -20,26 +20,26 @@ import org.apache.gluten.vectorized.JniLibLoader class SharedLibraryLoaderCentos7 extends SharedLibraryLoader { override def loadLib(loader: JniLibLoader): Unit = { - loader - .newTransaction() - .loadAndCreateLink("libboost_atomic.so.1.84.0", "libboost_atomic.so", false) - .loadAndCreateLink("libboost_thread.so.1.84.0", "libboost_thread.so", false) - .loadAndCreateLink("libboost_system.so.1.84.0", "libboost_system.so", false) - .loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) - .loadAndCreateLink("libboost_program_options.so.1.84.0", "libboost_program_options.so", false) - .loadAndCreateLink("libboost_filesystem.so.1.84.0", "libboost_filesystem.so", false) - .loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) - .loadAndCreateLink("libdouble-conversion.so.1", "libdouble-conversion.so", false) - .loadAndCreateLink("libevent-2.0.so.5", "libevent-2.0.so", false) - .loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) - .loadAndCreateLink("libglog.so.0", "libglog.so", false) - .loadAndCreateLink("libntlm.so.0", "libntlm.so", false) - .loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) - .loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) - .loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) - .loadAndCreateLink("libre2.so.10", "libre2.so", false) - .loadAndCreateLink("libzstd.so.1", "libzstd.so", false) - .loadAndCreateLink("liblz4.so.1", "liblz4.so", false) - .commit() + loader.loadAndCreateLink("libboost_atomic.so.1.84.0", "libboost_atomic.so", false) + loader.loadAndCreateLink("libboost_thread.so.1.84.0", "libboost_thread.so", false) + loader.loadAndCreateLink("libboost_system.so.1.84.0", "libboost_system.so", false) + loader.loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) + loader.loadAndCreateLink( + "libboost_program_options.so.1.84.0", + "libboost_program_options.so", + false) + loader.loadAndCreateLink("libboost_filesystem.so.1.84.0", "libboost_filesystem.so", false) + loader.loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) + loader.loadAndCreateLink("libdouble-conversion.so.1", "libdouble-conversion.so", false) + loader.loadAndCreateLink("libevent-2.0.so.5", "libevent-2.0.so", false) + loader.loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) + loader.loadAndCreateLink("libglog.so.0", "libglog.so", false) + loader.loadAndCreateLink("libntlm.so.0", "libntlm.so", false) + loader.loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) + loader.loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) + loader.loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) + loader.loadAndCreateLink("libre2.so.10", "libre2.so", false) + loader.loadAndCreateLink("libzstd.so.1", "libzstd.so", false) + loader.loadAndCreateLink("liblz4.so.1", "liblz4.so", false) } } diff --git 
a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos8.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos8.scala index c1d3bf2e26cb7..bd8bf15bec9f0 100755 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos8.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos8.scala @@ -20,30 +20,30 @@ import org.apache.gluten.vectorized.JniLibLoader class SharedLibraryLoaderCentos8 extends SharedLibraryLoader { override def loadLib(loader: JniLibLoader): Unit = { - loader - .newTransaction() - .loadAndCreateLink("libboost_atomic.so.1.84.0", "libboost_atomic.so", false) - .loadAndCreateLink("libboost_thread.so.1.84.0", "libboost_thread.so", false) - .loadAndCreateLink("libboost_system.so.1.84.0", "libboost_system.so", false) - .loadAndCreateLink("libicudata.so.60", "libicudata.so", false) - .loadAndCreateLink("libicuuc.so.60", "libicuuc.so", false) - .loadAndCreateLink("libicui18n.so.60", "libicui18n.so", false) - .loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) - .loadAndCreateLink("libboost_program_options.so.1.84.0", "libboost_program_options.so", false) - .loadAndCreateLink("libboost_filesystem.so.1.84.0", "libboost_filesystem.so", false) - .loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) - .loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) - .loadAndCreateLink("libevent-2.1.so.6", "libevent-2.1.so", false) - .loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) - .loadAndCreateLink("libglog.so.1", "libglog.so", false) - .loadAndCreateLink("libdwarf.so.1", "libdwarf.so", false) - .loadAndCreateLink("libidn.so.11", "libidn.so", false) - .loadAndCreateLink("libntlm.so.0", "libntlm.so", false) - .loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) - .loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) - .loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) - .loadAndCreateLink("libre2.so.0", "libre2.so", false) - .loadAndCreateLink("libsodium.so.23", "libsodium.so", false) - .commit() + loader.loadAndCreateLink("libboost_atomic.so.1.84.0", "libboost_atomic.so", false) + loader.loadAndCreateLink("libboost_thread.so.1.84.0", "libboost_thread.so", false) + loader.loadAndCreateLink("libboost_system.so.1.84.0", "libboost_system.so", false) + loader.loadAndCreateLink("libicudata.so.60", "libicudata.so", false) + loader.loadAndCreateLink("libicuuc.so.60", "libicuuc.so", false) + loader.loadAndCreateLink("libicui18n.so.60", "libicui18n.so", false) + loader.loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) + loader.loadAndCreateLink( + "libboost_program_options.so.1.84.0", + "libboost_program_options.so", + false) + loader.loadAndCreateLink("libboost_filesystem.so.1.84.0", "libboost_filesystem.so", false) + loader.loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) + loader.loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) + loader.loadAndCreateLink("libevent-2.1.so.6", "libevent-2.1.so", false) + loader.loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) + loader.loadAndCreateLink("libglog.so.1", "libglog.so", false) + loader.loadAndCreateLink("libdwarf.so.1", "libdwarf.so", false) + loader.loadAndCreateLink("libidn.so.11", "libidn.so", false) + loader.loadAndCreateLink("libntlm.so.0", "libntlm.so", false) + loader.loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) + 
loader.loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) + loader.loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) + loader.loadAndCreateLink("libre2.so.0", "libre2.so", false) + loader.loadAndCreateLink("libsodium.so.23", "libsodium.so", false) } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos9.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos9.scala index 2d9ececb366de..06fb25c8b0e42 100755 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos9.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderCentos9.scala @@ -20,30 +20,30 @@ import org.apache.gluten.vectorized.JniLibLoader class SharedLibraryLoaderCentos9 extends SharedLibraryLoader { override def loadLib(loader: JniLibLoader): Unit = { - loader - .newTransaction() - .loadAndCreateLink("libboost_atomic.so.1.84.0", "libboost_atomic.so", false) - .loadAndCreateLink("libboost_thread.so.1.84.0", "libboost_thread.so", false) - .loadAndCreateLink("libboost_system.so.1.84.0", "libboost_system.so", false) - .loadAndCreateLink("libicudata.so.67", "libicudata.so", false) - .loadAndCreateLink("libicuuc.so.67", "libicuuc.so", false) - .loadAndCreateLink("libicui18n.so.67", "libicui18n.so", false) - .loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) - .loadAndCreateLink("libboost_program_options.so.1.84.0", "libboost_program_options.so", false) - .loadAndCreateLink("libboost_filesystem.so.1.84.0", "libboost_filesystem.so", false) - .loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) - .loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) - .loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) - .loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) - .loadAndCreateLink("libglog.so.1", "libglog.so", false) - .loadAndCreateLink("libdwarf.so.0", "libdwarf.so", false) - .loadAndCreateLink("libidn.so.12", "libidn.so", false) - .loadAndCreateLink("libntlm.so.0", "libntlm.so", false) - .loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) - .loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) - .loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) - .loadAndCreateLink("libre2.so.9", "libre2.so", false) - .loadAndCreateLink("libsodium.so.23", "libsodium.so", false) - .commit() + loader.loadAndCreateLink("libboost_atomic.so.1.84.0", "libboost_atomic.so", false) + loader.loadAndCreateLink("libboost_thread.so.1.84.0", "libboost_thread.so", false) + loader.loadAndCreateLink("libboost_system.so.1.84.0", "libboost_system.so", false) + loader.loadAndCreateLink("libicudata.so.67", "libicudata.so", false) + loader.loadAndCreateLink("libicuuc.so.67", "libicuuc.so", false) + loader.loadAndCreateLink("libicui18n.so.67", "libicui18n.so", false) + loader.loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) + loader.loadAndCreateLink( + "libboost_program_options.so.1.84.0", + "libboost_program_options.so", + false) + loader.loadAndCreateLink("libboost_filesystem.so.1.84.0", "libboost_filesystem.so", false) + loader.loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) + loader.loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) + loader.loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) + loader.loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) + loader.loadAndCreateLink("libglog.so.1", 
"libglog.so", false) + loader.loadAndCreateLink("libdwarf.so.0", "libdwarf.so", false) + loader.loadAndCreateLink("libidn.so.12", "libidn.so", false) + loader.loadAndCreateLink("libntlm.so.0", "libntlm.so", false) + loader.loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) + loader.loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) + loader.loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) + loader.loadAndCreateLink("libre2.so.9", "libre2.so", false) + loader.loadAndCreateLink("libsodium.so.23", "libsodium.so", false) } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderDebian11.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderDebian11.scala index ca7d1d22d9840..a300cc5b9d057 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderDebian11.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderDebian11.scala @@ -20,33 +20,30 @@ import org.apache.gluten.vectorized.JniLibLoader class SharedLibraryLoaderDebian11 extends SharedLibraryLoader { override def loadLib(loader: JniLibLoader): Unit = { - loader - .newTransaction() - .loadAndCreateLink("libicudata.so.67", "libicudata.so", false) - .loadAndCreateLink("libre2.so.9", "libre2.so", false) - .loadAndCreateLink("libicuuc.so.67", "libicuuc.so", false) - .loadAndCreateLink("liblber-2.4.so.2", "liblber-2.4.so", false) - .loadAndCreateLink("libsasl2.so.2", "libsasl2.so", false) - .loadAndCreateLink("libbrotlicommon.so.1", "libbrotlicommon.so", false) - .loadAndCreateLink("libicui18n.so.67", "libicui18n.so", false) - .loadAndCreateLink("libunwind.so.8", "libunwind.so", false) - .loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) - .loadAndCreateLink("libnghttp2.so.14", "libnghttp2.so", false) - .loadAndCreateLink("librtmp.so.1", "librtmp.so", false) - .loadAndCreateLink("libssh2.so.1", "libssh2.so", false) - .loadAndCreateLink("libpsl.so.5", "libpsl.so", false) - .loadAndCreateLink("libldap_r-2.4.so.2", "libldap_r-2.4.so", false) - .loadAndCreateLink("libbrotlidec.so.1", "libbrotlidec.so", false) - .loadAndCreateLink("libthrift-0.13.0.so", "libthrift.so", false) - .loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) - .loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) - .loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) - .loadAndCreateLink("libglog.so.0", "libglog.so", false) - .loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) - .loadAndCreateLink("libsnappy.so.1", "libsnappy.so", false) - .loadAndCreateLink("libcurl.so.4", "libcurl.so", false) - .loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) - .loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) - .commit() + loader.loadAndCreateLink("libicudata.so.67", "libicudata.so", false) + loader.loadAndCreateLink("libre2.so.9", "libre2.so", false) + loader.loadAndCreateLink("libicuuc.so.67", "libicuuc.so", false) + loader.loadAndCreateLink("liblber-2.4.so.2", "liblber-2.4.so", false) + loader.loadAndCreateLink("libsasl2.so.2", "libsasl2.so", false) + loader.loadAndCreateLink("libbrotlicommon.so.1", "libbrotlicommon.so", false) + loader.loadAndCreateLink("libicui18n.so.67", "libicui18n.so", false) + loader.loadAndCreateLink("libunwind.so.8", "libunwind.so", false) + loader.loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) + loader.loadAndCreateLink("libnghttp2.so.14", "libnghttp2.so", false) + 
loader.loadAndCreateLink("librtmp.so.1", "librtmp.so", false) + loader.loadAndCreateLink("libssh2.so.1", "libssh2.so", false) + loader.loadAndCreateLink("libpsl.so.5", "libpsl.so", false) + loader.loadAndCreateLink("libldap_r-2.4.so.2", "libldap_r-2.4.so", false) + loader.loadAndCreateLink("libbrotlidec.so.1", "libbrotlidec.so", false) + loader.loadAndCreateLink("libthrift-0.13.0.so", "libthrift.so", false) + loader.loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) + loader.loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) + loader.loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) + loader.loadAndCreateLink("libglog.so.0", "libglog.so", false) + loader.loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) + loader.loadAndCreateLink("libsnappy.so.1", "libsnappy.so", false) + loader.loadAndCreateLink("libcurl.so.4", "libcurl.so", false) + loader.loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) + loader.loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderDebian12.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderDebian12.scala index 128c8eaa2aef2..8e24ef0bad52a 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderDebian12.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderDebian12.scala @@ -20,39 +20,36 @@ import org.apache.gluten.vectorized.JniLibLoader class SharedLibraryLoaderDebian12 extends SharedLibraryLoader { override def loadLib(loader: JniLibLoader): Unit = { - loader - .newTransaction() - .loadAndCreateLink("libcrypto.so.3", "libcrypto.so", false) - .loadAndCreateLink("libkrb5support.so.0", "libkrb5support.so", false) - .loadAndCreateLink("libssl.so.3", "libssl.so", false) - .loadAndCreateLink("libicudata.so.72", "libicudata.so", false) - .loadAndCreateLink("libk5crypto.so.3", "libk5crypto.so", false) - .loadAndCreateLink("libkeyutils.so.1", "libkeyutils.so", false) - .loadAndCreateLink("libsnappy.so.1", "libsnappy.so", false) - .loadAndCreateLink("libthrift-0.17.0.so", "libthrift.so", false) - .loadAndCreateLink("libicuuc.so.72", "libicuuc.so", false) - .loadAndCreateLink("libkrb5.so.3", "libkrb5.so", false) - .loadAndCreateLink("liblber-2.5.so.0", "liblber-2.4.so", false) - .loadAndCreateLink("libsasl2.so.2", "libsasl2.so", false) - .loadAndCreateLink("libbrotlicommon.so.1", "libbrotlicommon.so", false) - .loadAndCreateLink("libicui18n.so.72", "libicui18n.so", false) - .loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) - .loadAndCreateLink("libunwind.so.8", "libunwind.so", false) - .loadAndCreateLink("libnghttp2.so.14", "libnghttp2.so", false) - .loadAndCreateLink("librtmp.so.1", "librtmp.so", false) - .loadAndCreateLink("libssh2.so.1", "libssh2.so", false) - .loadAndCreateLink("libpsl.so.5", "libpsl.so", false) - .loadAndCreateLink("libgssapi_krb5.so.2", "libgssapi_krb5.so", false) - .loadAndCreateLink("libldap-2.5.so.0", "libldap_r-2.4.so", false) - .loadAndCreateLink("libbrotlidec.so.1", "libbrotlidec.so", false) - .loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) - .loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) - .loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) - .loadAndCreateLink("libglog.so.1", "libglog.so", false) - .loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", 
false) - .loadAndCreateLink("libcurl.so.4", "libcurl.so", false) - .loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) - .loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) - .commit() + loader.loadAndCreateLink("libcrypto.so.3", "libcrypto.so", false) + loader.loadAndCreateLink("libkrb5support.so.0", "libkrb5support.so", false) + loader.loadAndCreateLink("libssl.so.3", "libssl.so", false) + loader.loadAndCreateLink("libicudata.so.72", "libicudata.so", false) + loader.loadAndCreateLink("libk5crypto.so.3", "libk5crypto.so", false) + loader.loadAndCreateLink("libkeyutils.so.1", "libkeyutils.so", false) + loader.loadAndCreateLink("libsnappy.so.1", "libsnappy.so", false) + loader.loadAndCreateLink("libthrift-0.17.0.so", "libthrift.so", false) + loader.loadAndCreateLink("libicuuc.so.72", "libicuuc.so", false) + loader.loadAndCreateLink("libkrb5.so.3", "libkrb5.so", false) + loader.loadAndCreateLink("liblber-2.5.so.0", "liblber-2.4.so", false) + loader.loadAndCreateLink("libsasl2.so.2", "libsasl2.so", false) + loader.loadAndCreateLink("libbrotlicommon.so.1", "libbrotlicommon.so", false) + loader.loadAndCreateLink("libicui18n.so.72", "libicui18n.so", false) + loader.loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) + loader.loadAndCreateLink("libunwind.so.8", "libunwind.so", false) + loader.loadAndCreateLink("libnghttp2.so.14", "libnghttp2.so", false) + loader.loadAndCreateLink("librtmp.so.1", "librtmp.so", false) + loader.loadAndCreateLink("libssh2.so.1", "libssh2.so", false) + loader.loadAndCreateLink("libpsl.so.5", "libpsl.so", false) + loader.loadAndCreateLink("libgssapi_krb5.so.2", "libgssapi_krb5.so", false) + loader.loadAndCreateLink("libldap-2.5.so.0", "libldap_r-2.4.so", false) + loader.loadAndCreateLink("libbrotlidec.so.1", "libbrotlidec.so", false) + loader.loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) + loader.loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) + loader.loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) + loader.loadAndCreateLink("libglog.so.1", "libglog.so", false) + loader.loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) + loader.loadAndCreateLink("libcurl.so.4", "libcurl.so", false) + loader.loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) + loader.loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderUbuntu2004.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderUbuntu2004.scala index 18f2e6cfbeb32..a03a0a7e87c88 100755 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderUbuntu2004.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderUbuntu2004.scala @@ -20,49 +20,46 @@ import org.apache.gluten.vectorized.JniLibLoader class SharedLibraryLoaderUbuntu2004 extends SharedLibraryLoader { override def loadLib(loader: JniLibLoader): Unit = { - loader - .newTransaction() - .loadAndCreateLink("libroken.so.18", "libroken.so", false) - .loadAndCreateLink("libasn1.so.8", "libasn1.so", false) - .loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) - .loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) - .loadAndCreateLink("libbrotlicommon.so.1", "libbrotlicommon.so", false) - .loadAndCreateLink("libbrotlidec.so.1", "libbrotlidec.so", false) - .loadAndCreateLink("libpsl.so.5", "libpsl.so", false) - 
.loadAndCreateLink("libcrypto.so.1.1", "libcrypto.so", false) - .loadAndCreateLink("libnghttp2.so.14", "libnghttp2.so", false) - .loadAndCreateLink("libnettle.so.7", "libnettle.so", false) - .loadAndCreateLink("libhogweed.so.5", "libhogweed.so", false) - .loadAndCreateLink("librtmp.so.1", "librtmp.so", false) - .loadAndCreateLink("libssh.so.4", "libssh.so", false) - .loadAndCreateLink("libssl.so.1.1", "libssl.so", false) - .loadAndCreateLink("liblber-2.4.so.2", "liblber-2.4.so", false) - .loadAndCreateLink("libsasl2.so.2", "libsasl2.so", false) - .loadAndCreateLink("libwind.so.0", "libwind.so", false) - .loadAndCreateLink("libheimbase.so.1", "libheimbase.so", false) - .loadAndCreateLink("libhcrypto.so.4", "libhcrypto.so", false) - .loadAndCreateLink("libhx509.so.5", "libhx509.so", false) - .loadAndCreateLink("libkrb5.so.26", "libkrb5.so", false) - .loadAndCreateLink("libheimntlm.so.0", "libheimntlm.so", false) - .loadAndCreateLink("libgssapi.so.3", "libgssapi.so", false) - .loadAndCreateLink("libldap_r-2.4.so.2", "libldap_r-2.4.so", false) - .loadAndCreateLink("libcurl.so.4", "libcurl.so", false) - .loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) - .loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) - .loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) - .loadAndCreateLink("libunwind.so.8", "libunwind.so", false) - .loadAndCreateLink("libglog.so.0", "libglog.so", false) - .loadAndCreateLink("libidn.so.11", "libidn.so", false) - .loadAndCreateLink("libntlm.so.0", "libntlm.so", false) - .loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) - .loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) - .loadAndCreateLink("libicudata.so.66", "libicudata.so", false) - .loadAndCreateLink("libicuuc.so.66", "libicuuc.so", false) - .loadAndCreateLink("libxml2.so.2", "libxml2.so", false) - .loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) - .loadAndCreateLink("libre2.so.5", "libre2.so", false) - .loadAndCreateLink("libsnappy.so.1", "libsnappy.so", false) - .loadAndCreateLink("libthrift-0.13.0.so", "libthrift.so", false) - .commit() + loader.loadAndCreateLink("libroken.so.18", "libroken.so", false) + loader.loadAndCreateLink("libasn1.so.8", "libasn1.so", false) + loader.loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) + loader.loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) + loader.loadAndCreateLink("libbrotlicommon.so.1", "libbrotlicommon.so", false) + loader.loadAndCreateLink("libbrotlidec.so.1", "libbrotlidec.so", false) + loader.loadAndCreateLink("libpsl.so.5", "libpsl.so", false) + loader.loadAndCreateLink("libcrypto.so.1.1", "libcrypto.so", false) + loader.loadAndCreateLink("libnghttp2.so.14", "libnghttp2.so", false) + loader.loadAndCreateLink("libnettle.so.7", "libnettle.so", false) + loader.loadAndCreateLink("libhogweed.so.5", "libhogweed.so", false) + loader.loadAndCreateLink("librtmp.so.1", "librtmp.so", false) + loader.loadAndCreateLink("libssh.so.4", "libssh.so", false) + loader.loadAndCreateLink("libssl.so.1.1", "libssl.so", false) + loader.loadAndCreateLink("liblber-2.4.so.2", "liblber-2.4.so", false) + loader.loadAndCreateLink("libsasl2.so.2", "libsasl2.so", false) + loader.loadAndCreateLink("libwind.so.0", "libwind.so", false) + loader.loadAndCreateLink("libheimbase.so.1", "libheimbase.so", false) + loader.loadAndCreateLink("libhcrypto.so.4", "libhcrypto.so", false) + loader.loadAndCreateLink("libhx509.so.5", "libhx509.so", false) + 
loader.loadAndCreateLink("libkrb5.so.26", "libkrb5.so", false) + loader.loadAndCreateLink("libheimntlm.so.0", "libheimntlm.so", false) + loader.loadAndCreateLink("libgssapi.so.3", "libgssapi.so", false) + loader.loadAndCreateLink("libldap_r-2.4.so.2", "libldap_r-2.4.so", false) + loader.loadAndCreateLink("libcurl.so.4", "libcurl.so", false) + loader.loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) + loader.loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) + loader.loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) + loader.loadAndCreateLink("libunwind.so.8", "libunwind.so", false) + loader.loadAndCreateLink("libglog.so.0", "libglog.so", false) + loader.loadAndCreateLink("libidn.so.11", "libidn.so", false) + loader.loadAndCreateLink("libntlm.so.0", "libntlm.so", false) + loader.loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) + loader.loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) + loader.loadAndCreateLink("libicudata.so.66", "libicudata.so", false) + loader.loadAndCreateLink("libicuuc.so.66", "libicuuc.so", false) + loader.loadAndCreateLink("libxml2.so.2", "libxml2.so", false) + loader.loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) + loader.loadAndCreateLink("libre2.so.5", "libre2.so", false) + loader.loadAndCreateLink("libsnappy.so.1", "libsnappy.so", false) + loader.loadAndCreateLink("libthrift-0.13.0.so", "libthrift.so", false) } } diff --git a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderUbuntu2204.scala b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderUbuntu2204.scala index b23105b7dce05..4bb2b59b33ea1 100755 --- a/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderUbuntu2204.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/utils/SharedLibraryLoaderUbuntu2204.scala @@ -20,34 +20,31 @@ import org.apache.gluten.vectorized.JniLibLoader class SharedLibraryLoaderUbuntu2204 extends SharedLibraryLoader { override def loadLib(loader: JniLibLoader): Unit = { - loader - .newTransaction() - .loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) - .loadAndCreateLink("libicudata.so.70", "libicudata.so", false) - .loadAndCreateLink("libicuuc.so.70", "libicuuc.so", false) - .loadAndCreateLink("libicui18n.so.70", "libicui18n.so", false) - .loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) - .loadAndCreateLink("libnghttp2.so.14", "libnghttp2.so", false) - .loadAndCreateLink("librtmp.so.1", "librtmp.so", false) - .loadAndCreateLink("libssh.so.4", "libssh.so", false) - .loadAndCreateLink("libsasl2.so.2", "libsasl2.so", false) - .loadAndCreateLink("liblber-2.5.so.0", "liblber-2.5.so", false) - .loadAndCreateLink("libldap-2.5.so.0", "libldap-2.5.so", false) - .loadAndCreateLink("libcurl.so.4", "libcurl.so", false) - .loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) - .loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) - .loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) - .loadAndCreateLink("libunwind.so.8", "libunwind.so", false) - .loadAndCreateLink("libglog.so.0", "libglog.so", false) - .loadAndCreateLink("libidn.so.12", "libidn.so", false) - .loadAndCreateLink("libntlm.so.0", "libntlm.so", false) - .loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) - .loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) - .loadAndCreateLink("libxml2.so.2", "libxml2.so", false) - .loadAndCreateLink("libhdfs3.so.1", 
"libhdfs3.so", false) - .loadAndCreateLink("libre2.so.9", "libre2.so", false) - .loadAndCreateLink("libsnappy.so.1", "libsnappy.so", false) - .loadAndCreateLink("libthrift-0.16.0.so", "libthrift.so", false) - .commit() + loader.loadAndCreateLink("libboost_context.so.1.84.0", "libboost_context.so", false) + loader.loadAndCreateLink("libicudata.so.70", "libicudata.so", false) + loader.loadAndCreateLink("libicuuc.so.70", "libicuuc.so", false) + loader.loadAndCreateLink("libicui18n.so.70", "libicui18n.so", false) + loader.loadAndCreateLink("libboost_regex.so.1.84.0", "libboost_regex.so", false) + loader.loadAndCreateLink("libnghttp2.so.14", "libnghttp2.so", false) + loader.loadAndCreateLink("librtmp.so.1", "librtmp.so", false) + loader.loadAndCreateLink("libssh.so.4", "libssh.so", false) + loader.loadAndCreateLink("libsasl2.so.2", "libsasl2.so", false) + loader.loadAndCreateLink("liblber-2.5.so.0", "liblber-2.5.so", false) + loader.loadAndCreateLink("libldap-2.5.so.0", "libldap-2.5.so", false) + loader.loadAndCreateLink("libcurl.so.4", "libcurl.so", false) + loader.loadAndCreateLink("libdouble-conversion.so.3", "libdouble-conversion.so", false) + loader.loadAndCreateLink("libevent-2.1.so.7", "libevent-2.1.so", false) + loader.loadAndCreateLink("libgflags.so.2.2", "libgflags.so", false) + loader.loadAndCreateLink("libunwind.so.8", "libunwind.so", false) + loader.loadAndCreateLink("libglog.so.0", "libglog.so", false) + loader.loadAndCreateLink("libidn.so.12", "libidn.so", false) + loader.loadAndCreateLink("libntlm.so.0", "libntlm.so", false) + loader.loadAndCreateLink("libgsasl.so.7", "libgsasl.so", false) + loader.loadAndCreateLink("libprotobuf.so.32", "libprotobuf.so", false) + loader.loadAndCreateLink("libxml2.so.2", "libxml2.so", false) + loader.loadAndCreateLink("libhdfs3.so.1", "libhdfs3.so", false) + loader.loadAndCreateLink("libre2.so.9", "libre2.so", false) + loader.loadAndCreateLink("libsnappy.so.1", "libsnappy.so", false) + loader.loadAndCreateLink("libthrift-0.16.0.so", "libthrift.so", false) } } diff --git a/backends-velox/src/main/scala/org/apache/spark/api/python/ColumnarArrowEvalPythonExec.scala b/backends-velox/src/main/scala/org/apache/spark/api/python/ColumnarArrowEvalPythonExec.scala index 88280ff2eddee..0e01c9d5d82fa 100644 --- a/backends-velox/src/main/scala/org/apache/spark/api/python/ColumnarArrowEvalPythonExec.scala +++ b/backends-velox/src/main/scala/org/apache/spark/api/python/ColumnarArrowEvalPythonExec.scala @@ -44,7 +44,7 @@ import java.io.{DataInputStream, DataOutputStream} import java.net.Socket import java.util.concurrent.atomic.AtomicBoolean -import scala.collection.{mutable, Seq} +import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class ColumnarArrowPythonRunner( @@ -54,7 +54,7 @@ class ColumnarArrowPythonRunner( schema: StructType, timeZoneId: String, conf: Map[String, String]) - extends BasePythonRunnerShim(funcs, evalType, argOffsets) { + extends BasePythonRunnerShim(funcs.toSeq, evalType, argOffsets) { override val simplifiedTraceback: Boolean = SQLConf.get.pysparkSimplifiedTraceback @@ -239,7 +239,7 @@ case class ColumnarArrowEvalPythonExec( val arrowSafeTypeCheck = Seq( SQLConf.PANDAS_ARROW_SAFE_TYPE_CONVERSION.key -> conf.arrowSafeTypeConversion.toString) - Map(timeZoneConf ++ pandasColsByName ++ arrowSafeTypeCheck: _*) + Map(timeZoneConf.toSeq ++ pandasColsByName.toSeq ++ arrowSafeTypeCheck: _*) } private val pythonRunnerConf = getPythonRunnerConfMap(conf) @@ -280,7 +280,7 @@ case class ColumnarArrowEvalPythonExec( case children 
=> // There should not be any other UDFs, or the children can't be evaluated directly. assert(children.forall(_.find(_.isInstanceOf[PythonUDF]).isEmpty)) - (ChainedPythonFunctions(Seq(udf.func)), udf.children) + (ChainedPythonFunctions(Seq(udf.func).toSeq), udf.children) } } @@ -410,7 +410,7 @@ object PullOutArrowEvalPythonPreProjectHelper extends PullOutProjectHelper { val (chained, children) = collectFunctions(u) (ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children) case children => - (ChainedPythonFunctions(Seq(udf.func)), udf.children) + (ChainedPythonFunctions(Seq(udf.func).toSeq), udf.children) } } diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/BroadcastUtils.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/BroadcastUtils.scala index 3d532133a0537..29a12f532bea7 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/BroadcastUtils.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/BroadcastUtils.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.execution import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.exec.Runtimes +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.vectorized.{ColumnarBatchSerializeResult, ColumnarBatchSerializerJniWrapper} diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/ColumnarCachedBatchSerializer.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/ColumnarCachedBatchSerializer.scala index 15fd51abef489..7f4235fdf1078 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/ColumnarCachedBatchSerializer.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/ColumnarCachedBatchSerializer.scala @@ -19,16 +19,15 @@ package org.apache.spark.sql.execution import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.exec.Runtimes import org.apache.gluten.execution.{RowToVeloxColumnarExec, VeloxColumnarToRowExec} import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.utils.ArrowAbiUtil import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.ColumnarBatchSerializerJniWrapper import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{InternalRow, SQLConfHelper} import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.columnar.{CachedBatch, CachedBatchSerializer} @@ -134,22 +133,9 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with SQLConfHe conf) } - // note, these metrics are unused but just make `RowToVeloxColumnarExec` happy - val metrics = BackendsApiManager.getMetricsApiInstance.genRowToColumnarMetrics( - SparkSession.getActiveSession.orNull.sparkContext) - val numInputRows = metrics("numInputRows") - val numOutputBatches = metrics("numOutputBatches") - val convertTime = metrics("convertTime") val numRows = conf.columnBatchSize val rddColumnarBatch = input.mapPartitions { - it => - RowToVeloxColumnarExec.toColumnarBatchIterator( - it, - localSchema, - numInputRows, - numOutputBatches, - convertTime, - numRows) + it => RowToVeloxColumnarExec.toColumnarBatchIterator(it, localSchema, numRows) } 
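The `.toSeq` conversions added above in ColumnarArrowEvalPythonExec, together with dropping `Seq` from the `scala.collection` import, are the usual Scala 2.12/2.13 cross-build fix: on 2.13 the default `Seq` alias points at `scala.collection.immutable.Seq`, so values typed as the generic `scala.collection.Seq` (or mutable buffers) need an explicit conversion. A minimal standalone sketch of the incompatibility, not part of the patch:

    import scala.collection.mutable

    object SeqCompatSketch {
      // On Scala 2.13 this parameter type resolves to scala.collection.immutable.Seq.
      def needsImmutable(xs: Seq[Int]): Int = xs.sum

      def main(args: Array[String]): Unit = {
        val buf = mutable.ArrayBuffer(1, 2, 3)
        // needsImmutable(buf)              // compiles on 2.12, fails to compile on 2.13
        println(needsImmutable(buf.toSeq))  // compiles on both
      }
    }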
convertColumnarBatchToCachedBatch(rddColumnarBatch, schema, storageLevel, conf) } @@ -169,22 +155,10 @@ class ColumnarCachedBatchSerializer extends CachedBatchSerializer with SQLConfHe conf) } - // note, these metrics are unused but just make `VeloxColumnarToRowExec` happy - val metrics = BackendsApiManager.getMetricsApiInstance.genColumnarToRowMetrics( - SparkSession.getActiveSession.orNull.sparkContext) - val numOutputRows = metrics("numOutputRows") - val numInputBatches = metrics("numInputBatches") - val convertTime = metrics("convertTime") val rddColumnarBatch = convertCachedBatchToColumnarBatch(input, cacheAttributes, selectedAttributes, conf) rddColumnarBatch.mapPartitions { - it => - VeloxColumnarToRowExec.toRowIterator( - it, - selectedAttributes, - numOutputRows, - numInputBatches, - convertTime) + it => VeloxColumnarToRowExec.toRowIterator(it, selectedAttributes) } } diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/SparkWriteFilesCommitProtocol.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/SparkWriteFilesCommitProtocol.scala similarity index 82% rename from gluten-core/src/main/scala/org/apache/spark/sql/execution/SparkWriteFilesCommitProtocol.scala rename to backends-velox/src/main/scala/org/apache/spark/sql/execution/SparkWriteFilesCommitProtocol.scala index 5e3ab83e32e77..845f2f98fb8c8 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/SparkWriteFilesCommitProtocol.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/SparkWriteFilesCommitProtocol.scala @@ -18,7 +18,7 @@ package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.{FileCommitProtocol, FileNameSpec, HadoopMapReduceCommitProtocol} +import org.apache.spark.internal.io.{FileCommitProtocol, HadoopMapReduceCommitProtocol} import org.apache.spark.sql.execution.datasources.WriteJobDescription import org.apache.spark.util.Utils @@ -41,9 +41,9 @@ class SparkWriteFilesCommitProtocol( extends Logging { assert(committer.isInstanceOf[HadoopMapReduceCommitProtocol]) - val sparkStageId: Int = TaskContext.get().stageId() - val sparkPartitionId: Int = TaskContext.get().partitionId() - private val sparkAttemptNumber = TaskContext.get().taskAttemptId().toInt & Int.MaxValue + val sparkStageId = TaskContext.get().stageId() + val sparkPartitionId = TaskContext.get().partitionId() + val sparkAttemptNumber = TaskContext.get().taskAttemptId().toInt & Int.MaxValue private val jobId = createJobID(jobTrackerID, sparkStageId) private val taskId = new TaskID(jobId, TaskType.MAP, sparkPartitionId) @@ -68,21 +68,6 @@ class SparkWriteFilesCommitProtocol( field.get(committer).asInstanceOf[OutputCommitter] } - private lazy val internalGetFilename = { - val m = classOf[HadoopMapReduceCommitProtocol] - .getDeclaredMethod("getFilename", classOf[TaskAttemptContext], classOf[FileNameSpec]) - m.setAccessible(true) - m - } - - def getFilename: String = { - val fileCounter = 0 - val suffix = f".c$fileCounter%03d" + - description.outputWriterFactory.getFileExtension(taskAttemptContext) - val fileNameSpec = FileNameSpec("", suffix) - internalGetFilename.invoke(committer, taskAttemptContext, fileNameSpec).asInstanceOf[String] - } - def setupTask(): Unit = { committer.setupTask(taskAttemptContext) } diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxBackendWrite.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxBackendWrite.scala 
deleted file mode 100644 index 5d47aff04d591..0000000000000 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxBackendWrite.scala +++ /dev/null @@ -1,138 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.spark.sql.execution - -import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators - -import org.apache.spark.internal.Logging -import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage -import org.apache.spark.sql.catalyst.expressions.GenericInternalRow -import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.vectorized.ColumnarBatch - -import com.fasterxml.jackson.databind.ObjectMapper -import com.fasterxml.jackson.module.scala.DefaultScalaModule - -import scala.collection.mutable - -// Velox write files metrics start -// -// Follows the code in velox `HiveDataSink::close()` -// The json can be as following: -// { -// "inMemoryDataSizeInBytes":0, -// "containsNumberedFileNames":true, -// "onDiskDataSizeInBytes":307, -// "fileWriteInfos":[ -// { -// "fileSize":307, -// "writeFileName": -// "Gluten_Stage_1_TID_2_0_2_d1db3b31-4f99-41cb-a4e7-3b8607506168.parquet", -// "targetFileName": -// "Gluten_Stage_1_TID_2_0_2_d1db3b31-4f99-41cb-a4e7-3b8607506168.parquet" -// } -// ], -// "writePath":"file:/home/gluten/spark-warehouse/inserttable/part1=1/part2=1", -// "rowCount":1, -// "targetPath":"file:/home/gluten/spark-warehouse/inserttable/part1=1/part2=1", -// "updateMode":"NEW", -// "name":"part1=1/part2=1" -// } -case class VeloxWriteFilesInfo(writeFileName: String, targetFileName: String, fileSize: Long) - -case class VeloxWriteFilesMetrics( - name: String, - updateMode: String, - writePath: String, - targetPath: String, - fileWriteInfos: Seq[VeloxWriteFilesInfo], - rowCount: Long, - inMemoryDataSizeInBytes: Long, - onDiskDataSizeInBytes: Long, - containsNumberedFileNames: Boolean) - -// Velox write files metrics end - -case class VeloxBackendWrite(description: WriteJobDescription) extends BackendWrite with Logging { - - override def collectNativeWriteFilesMetrics(cb: ColumnarBatch): Option[WriteTaskResult] = { - // Currently, the cb contains three columns: row, fragments, and context. - // The first row in the row column contains the number of written numRows. - // The fragments column contains detailed information about the file writes. 
- val loadedCb = ColumnarBatches.ensureLoaded(ArrowBufferAllocators.contextInstance, cb) - assert(loadedCb.numCols() == 3) - val numWrittenRows = loadedCb.column(0).getLong(0) - - var updatedPartitions = Set.empty[String] - val addedAbsPathFiles: mutable.Map[String, String] = mutable.Map[String, String]() - var numBytes = 0L - val objectMapper = new ObjectMapper() - objectMapper.registerModule(DefaultScalaModule) - for (i <- 0 until loadedCb.numRows() - 1) { - val fragments = loadedCb.column(1).getUTF8String(i + 1) - val metrics = objectMapper - .readValue(fragments.toString.getBytes("UTF-8"), classOf[VeloxWriteFilesMetrics]) - logDebug(s"Velox write files metrics: $metrics") - - val fileWriteInfos = metrics.fileWriteInfos - assert(fileWriteInfos.length == 1) - val fileWriteInfo = fileWriteInfos.head - numBytes += fileWriteInfo.fileSize - val targetFileName = fileWriteInfo.targetFileName - val outputPath = description.path - - // part1=1/part2=1 - val partitionFragment = metrics.name - // Write a partitioned table - if (partitionFragment != "") { - updatedPartitions += partitionFragment - val tmpOutputPath = outputPath + "/" + partitionFragment + "/" + targetFileName - val customOutputPath = description.customPartitionLocations.get( - PartitioningUtils.parsePathFragment(partitionFragment)) - if (customOutputPath.isDefined) { - addedAbsPathFiles(tmpOutputPath) = customOutputPath.get + "/" + targetFileName - } - } - } - - val numFiles = loadedCb.numRows() - 1 - val partitionsInternalRows = updatedPartitions.map { - part => - val parts = new Array[Any](1) - parts(0) = part - new GenericInternalRow(parts) - }.toSeq - val stats = BasicWriteTaskStats( - partitions = partitionsInternalRows, - numFiles = numFiles, - numBytes = numBytes, - numRows = numWrittenRows) - val summary = - ExecutedWriteSummary(updatedPartitions = updatedPartitions, stats = Seq(stats)) - - // Write an empty iterator - if (numFiles == 0) { - None - } else { - Some( - WriteTaskResult( - new TaskCommitMessage(addedAbsPathFiles.toMap -> updatedPartitions), - summary)) - } - } -} diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxColumnarWriteFilesExec.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxColumnarWriteFilesExec.scala new file mode 100644 index 0000000000000..c339014c5e7cf --- /dev/null +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/VeloxColumnarWriteFilesExec.scala @@ -0,0 +1,280 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.execution + +import org.apache.gluten.backendsapi.BackendsApiManager +import org.apache.gluten.columnarbatch.ColumnarBatches +import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators + +import org.apache.spark.{Partition, SparkException, TaskContext, TaskOutputFileAlreadyExistException} +import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils} +import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage +import org.apache.spark.rdd.RDD +import org.apache.spark.shuffle.FetchFailedException +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericInternalRow} +import org.apache.spark.sql.connector.write.WriterCommitMessage +import org.apache.spark.sql.execution.datasources._ +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.vectorized.ColumnarBatch +import org.apache.spark.util.Utils + +import com.fasterxml.jackson.databind.ObjectMapper +import com.fasterxml.jackson.module.scala.DefaultScalaModule +import org.apache.hadoop.fs.FileAlreadyExistsException + +import java.util.Date + +import scala.collection.mutable + +// Velox write files metrics start +// +// Follows the code in velox `HiveDataSink::close()` +// The json can be as following: +// { +// "inMemoryDataSizeInBytes":0, +// "containsNumberedFileNames":true, +// "onDiskDataSizeInBytes":307, +// "fileWriteInfos":[ +// { +// "fileSize":307, +// "writeFileName": +// "Gluten_Stage_1_TID_2_0_2_d1db3b31-4f99-41cb-a4e7-3b8607506168.parquet", +// "targetFileName": +// "Gluten_Stage_1_TID_2_0_2_d1db3b31-4f99-41cb-a4e7-3b8607506168.parquet" +// } +// ], +// "writePath":"file:/home/gluten/spark-warehouse/inserttable/part1=1/part2=1", +// "rowCount":1, +// "targetPath":"file:/home/gluten/spark-warehouse/inserttable/part1=1/part2=1", +// "updateMode":"NEW", +// "name":"part1=1/part2=1" +// } +case class VeloxWriteFilesInfo(writeFileName: String, targetFileName: String, fileSize: Long) + +case class VeloxWriteFilesMetrics( + name: String, + updateMode: String, + writePath: String, + targetPath: String, + fileWriteInfos: Seq[VeloxWriteFilesInfo], + rowCount: Long, + inMemoryDataSizeInBytes: Long, + onDiskDataSizeInBytes: Long, + containsNumberedFileNames: Boolean) + +// Velox write files metrics end + +/** + * This RDD is used to make sure we have injected staging write path before initializing the native + * plan, and support Spark file commit protocol. + */ +class VeloxColumnarWriteFilesRDD( + var prev: RDD[ColumnarBatch], + description: WriteJobDescription, + committer: FileCommitProtocol, + jobTrackerID: String) + extends RDD[WriterCommitMessage](prev) { + + private def collectNativeWriteFilesMetrics(cb: ColumnarBatch): Option[WriteTaskResult] = { + // Currently, the cb contains three columns: row, fragments, and context. + // The first row in the row column contains the number of written numRows. + // The fragments column contains detailed information about the file writes. 
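For reference, a minimal sketch of how one `fragments` JSON string of the shape documented above maps onto the `VeloxWriteFilesMetrics` case class defined just above (assumed to be in scope); the sample literal is made up for illustration, not output from a real Velox run:

    import com.fasterxml.jackson.databind.ObjectMapper
    import com.fasterxml.jackson.module.scala.DefaultScalaModule

    object FragmentsSketch {
      def main(args: Array[String]): Unit = {
        val sample =
          """{"name":"part1=1/part2=1","updateMode":"NEW",
            |"writePath":"file:/tmp/warehouse/part1=1/part2=1",
            |"targetPath":"file:/tmp/warehouse/part1=1/part2=1",
            |"fileWriteInfos":[{"writeFileName":"f.parquet","targetFileName":"f.parquet","fileSize":307}],
            |"rowCount":1,"inMemoryDataSizeInBytes":0,"onDiskDataSizeInBytes":307,
            |"containsNumberedFileNames":true}""".stripMargin
        val mapper = new ObjectMapper()
        mapper.registerModule(DefaultScalaModule) // needed so Seq fields and case classes deserialize
        val metrics = mapper.readValue(sample, classOf[VeloxWriteFilesMetrics])
        assert(metrics.fileWriteInfos.length == 1) // each fragment describes exactly one written file
        println(metrics.fileWriteInfos.head.targetFileName)
      }
    }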
+ val loadedCb = ColumnarBatches.ensureLoaded(ArrowBufferAllocators.contextInstance, cb) + assert(loadedCb.numCols() == 3) + val numWrittenRows = loadedCb.column(0).getLong(0) + + var updatedPartitions = Set.empty[String] + val addedAbsPathFiles: mutable.Map[String, String] = mutable.Map[String, String]() + var numBytes = 0L + val objectMapper = new ObjectMapper() + objectMapper.registerModule(DefaultScalaModule) + for (i <- 0 until loadedCb.numRows() - 1) { + val fragments = loadedCb.column(1).getUTF8String(i + 1) + val metrics = objectMapper + .readValue(fragments.toString.getBytes("UTF-8"), classOf[VeloxWriteFilesMetrics]) + logDebug(s"Velox write files metrics: $metrics") + + val fileWriteInfos = metrics.fileWriteInfos + assert(fileWriteInfos.length == 1) + val fileWriteInfo = fileWriteInfos.head + numBytes += fileWriteInfo.fileSize + val targetFileName = fileWriteInfo.targetFileName + val outputPath = description.path + + // part1=1/part2=1 + val partitionFragment = metrics.name + // Write a partitioned table + if (partitionFragment != "") { + updatedPartitions += partitionFragment + val tmpOutputPath = outputPath + "/" + partitionFragment + "/" + targetFileName + val customOutputPath = description.customPartitionLocations.get( + PartitioningUtils.parsePathFragment(partitionFragment)) + if (customOutputPath.isDefined) { + addedAbsPathFiles(tmpOutputPath) = customOutputPath.get + "/" + targetFileName + } + } + } + + val numFiles = loadedCb.numRows() - 1 + val partitionsInternalRows = updatedPartitions.map { + part => + val parts = new Array[Any](1) + parts(0) = part + new GenericInternalRow(parts) + }.toSeq + val stats = BasicWriteTaskStats( + partitions = partitionsInternalRows, + numFiles = numFiles, + numBytes = numBytes, + numRows = numWrittenRows) + val summary = + ExecutedWriteSummary(updatedPartitions = updatedPartitions, stats = Seq(stats)) + + // Write an empty iterator + if (numFiles == 0) { + None + } else { + Some( + WriteTaskResult( + new TaskCommitMessage(addedAbsPathFiles.toMap -> updatedPartitions), + summary)) + } + } + + private def reportTaskMetrics(writeTaskResult: WriteTaskResult): Unit = { + val stats = writeTaskResult.summary.stats.head.asInstanceOf[BasicWriteTaskStats] + val (numBytes, numWrittenRows) = (stats.numBytes, stats.numRows) + // Reports bytesWritten and recordsWritten to the Spark output metrics. + // We should update it after calling `commitTask` to overwrite the metrics. + Option(TaskContext.get()).map(_.taskMetrics().outputMetrics).foreach { + outputMetrics => + outputMetrics.setBytesWritten(numBytes) + outputMetrics.setRecordsWritten(numWrittenRows) + } + } + + private def writeFilesForEmptyIterator( + commitProtocol: SparkWriteFilesCommitProtocol): WriteTaskResult = { + val taskAttemptContext = commitProtocol.taskAttemptContext + + val dataWriter = + if (commitProtocol.sparkPartitionId != 0) { + // In case of empty job, leave first partition to save meta for file format like parquet. 
+ new EmptyDirectoryDataWriter(description, taskAttemptContext, committer) + } else if (description.partitionColumns.isEmpty) { + new SingleDirectoryDataWriter(description, taskAttemptContext, committer) + } else { + new DynamicPartitionDataSingleWriter(description, taskAttemptContext, committer) + } + + // We have done `setupTask` outside + dataWriter.writeWithIterator(Iterator.empty) + dataWriter.commit() + } + + override def compute(split: Partition, context: TaskContext): Iterator[WriterCommitMessage] = { + val commitProtocol = new SparkWriteFilesCommitProtocol(jobTrackerID, description, committer) + + commitProtocol.setupTask() + val writePath = commitProtocol.newTaskAttemptTempPath() + logDebug(s"Velox staging write path: $writePath") + var writeTaskResult: WriteTaskResult = null + try { + Utils.tryWithSafeFinallyAndFailureCallbacks(block = { + BackendsApiManager.getIteratorApiInstance.injectWriteFilesTempPath(writePath, "") + + // Initialize the native plan + val iter = firstParent[ColumnarBatch].iterator(split, context) + assert(iter.hasNext) + val resultColumnarBatch = iter.next() + assert(resultColumnarBatch != null) + val nativeWriteTaskResult = collectNativeWriteFilesMetrics(resultColumnarBatch) + if (nativeWriteTaskResult.isEmpty) { + // If we are writing an empty iterator, then velox would do nothing. + // Here we fallback to use vanilla Spark write files to generate an empty file for + // metadata only. + writeTaskResult = writeFilesForEmptyIterator(commitProtocol) + // We have done commit task inside `writeFilesForEmptyIterator`. + } else { + writeTaskResult = nativeWriteTaskResult.get + commitProtocol.commitTask() + } + })( + catchBlock = { + // If there is an error, abort the task + commitProtocol.abortTask() + logError(s"Job ${commitProtocol.getJobId} aborted.") + } + ) + } catch { + case e: FetchFailedException => + throw e + case f: FileAlreadyExistsException if SQLConf.get.fastFailFileFormatOutput => + throw new TaskOutputFileAlreadyExistException(f) + case t: Throwable => + throw new SparkException( + s"Task failed while writing rows to staging path: $writePath, " + + s"output path: ${description.path}", + t) + } + + assert(writeTaskResult != null) + reportTaskMetrics(writeTaskResult) + Iterator.single(writeTaskResult) + } + + override protected def getPartitions: Array[Partition] = firstParent[ColumnarBatch].partitions + + override def clearDependencies(): Unit = { + super.clearDependencies() + prev = null + } +} + +case class VeloxColumnarWriteFilesExec private ( + override val left: SparkPlan, + override val right: SparkPlan, + fileFormat: FileFormat, + partitionColumns: Seq[Attribute], + bucketSpec: Option[BucketSpec], + options: Map[String, String], + staticPartitions: TablePartitionSpec) + extends ColumnarWriteFilesExec(left, right) { + + override def doExecuteWrite(writeFilesSpec: WriteFilesSpec): RDD[WriterCommitMessage] = { + assert(child.supportsColumnar) + + val rdd = child.executeColumnar() + val jobTrackerID = SparkHadoopWriterUtils.createJobTrackerID(new Date()) + val description = writeFilesSpec.description + val committer = writeFilesSpec.committer + if (rdd.partitions.length == 0) { + // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single + // partition rdd to make sure we at least set up one write task to write the metadata. 
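The fallback in `writeFilesForEmptyIterator` above picks a vanilla Spark writer purely from the Spark partition id and whether the table has partition columns, since only partition 0 needs to emit the metadata-only file. A standalone sketch of just that selection rule (the `WriterKind` ADT is hypothetical, introduced only for illustration):

    sealed trait WriterKind
    case object EmptyDirectory extends WriterKind    // non-zero partitions write nothing
    case object SingleDirectory extends WriterKind   // partition 0, unpartitioned table
    case object DynamicPartition extends WriterKind  // partition 0, partitioned table

    def writerForEmptyInput(sparkPartitionId: Int, hasPartitionColumns: Boolean): WriterKind =
      if (sparkPartitionId != 0) EmptyDirectory
      else if (!hasPartitionColumns) SingleDirectory
      else DynamicPartition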
+ writeFilesForEmptyRDD(description, committer, jobTrackerID) + } else { + new VeloxColumnarWriteFilesRDD(rdd, description, committer, jobTrackerID) + } + } + + override protected def withNewChildrenInternal( + newLeft: SparkPlan, + newRight: SparkPlan): SparkPlan = + copy(newLeft, newRight, fileFormat, partitionColumns, bucketSpec, options, staticPartitions) +} diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxFormatWriterInjects.scala b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxFormatWriterInjects.scala index 7da4da5f0784a..e25d3c663d698 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxFormatWriterInjects.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/execution/datasources/velox/VeloxFormatWriterInjects.scala @@ -19,9 +19,9 @@ package org.apache.spark.sql.execution.datasources.velox import org.apache.gluten.columnarbatch.{ColumnarBatches, ColumnarBatchJniWrapper} import org.apache.gluten.datasource.DatasourceJniWrapper import org.apache.gluten.exception.GlutenException -import org.apache.gluten.exec.Runtimes import org.apache.gluten.execution.datasource.GlutenRowSplitter import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.utils.{ArrowAbiUtil, DatasourceUtil} import org.apache.spark.sql.SparkSession diff --git a/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala b/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala index 99f9faf9914a0..ab83c55ee3069 100644 --- a/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala +++ b/backends-velox/src/main/scala/org/apache/spark/sql/expression/UDFResolver.scala @@ -27,11 +27,12 @@ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.{FunctionIdentifier, InternalRow} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder -import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, ExpressionInfo, Unevaluable} +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Cast, Expression, ExpressionInfo, Unevaluable} import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.types.DataTypeUtils import org.apache.spark.sql.errors.QueryExecutionErrors +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.util.Utils @@ -74,18 +75,21 @@ trait UDFSignatureBase { val expressionType: ExpressionType val children: Seq[DataType] val variableArity: Boolean + val allowTypeConversion: Boolean } case class UDFSignature( expressionType: ExpressionType, children: Seq[DataType], - variableArity: Boolean) + variableArity: Boolean, + allowTypeConversion: Boolean) extends UDFSignatureBase case class UDAFSignature( expressionType: ExpressionType, children: Seq[DataType], variableArity: Boolean, + allowTypeConversion: Boolean, intermediateAttrs: Seq[AttributeReference]) extends UDFSignatureBase @@ -116,12 +120,12 @@ case class UDFExpression( object UDFResolver extends Logging { private val UDFNames = mutable.HashSet[String]() // (udf_name, arg1, arg2, ...) 
=> return type - private val UDFMap = mutable.HashMap[String, mutable.MutableList[UDFSignature]]() + private val UDFMap = mutable.HashMap[String, mutable.ListBuffer[UDFSignature]]() private val UDAFNames = mutable.HashSet[String]() // (udaf_name, arg1, arg2, ...) => return type, intermediate attributes private val UDAFMap = - mutable.HashMap[String, mutable.MutableList[UDAFSignature]]() + mutable.HashMap[String, mutable.ListBuffer[UDAFSignature]]() private val LIB_EXTENSION = ".so" @@ -130,26 +134,30 @@ object UDFResolver extends Logging { name: String, returnType: Array[Byte], argTypes: Array[Byte], - variableArity: Boolean): Unit = { + variableArity: Boolean, + allowTypeConversion: Boolean): Unit = { registerUDF( name, ConverterUtils.parseFromBytes(returnType), ConverterUtils.parseFromBytes(argTypes), - variableArity) + variableArity, + allowTypeConversion) } private def registerUDF( name: String, returnType: ExpressionType, argTypes: ExpressionType, - variableArity: Boolean): Unit = { + variableArity: Boolean, + allowTypeConversion: Boolean): Unit = { assert(argTypes.dataType.isInstanceOf[StructType]) val v = - UDFMap.getOrElseUpdate(name, mutable.MutableList[UDFSignature]()) + UDFMap.getOrElseUpdate(name, mutable.ListBuffer[UDFSignature]()) v += UDFSignature( returnType, argTypes.dataType.asInstanceOf[StructType].fields.map(_.dataType), - variableArity) + variableArity, + allowTypeConversion) UDFNames += name logInfo(s"Registered UDF: $name($argTypes) -> $returnType") } @@ -159,13 +167,15 @@ object UDFResolver extends Logging { returnType: Array[Byte], argTypes: Array[Byte], intermediateTypes: Array[Byte], - variableArity: Boolean): Unit = { + variableArity: Boolean, + enableTypeConversion: Boolean): Unit = { registerUDAF( name, ConverterUtils.parseFromBytes(returnType), ConverterUtils.parseFromBytes(argTypes), ConverterUtils.parseFromBytes(intermediateTypes), - variableArity + variableArity, + enableTypeConversion ) } @@ -174,7 +184,8 @@ object UDFResolver extends Logging { returnType: ExpressionType, argTypes: ExpressionType, intermediateTypes: ExpressionType, - variableArity: Boolean): Unit = { + variableArity: Boolean, + allowTypeConversion: Boolean): Unit = { assert(argTypes.dataType.isInstanceOf[StructType]) val aggBufferAttributes: Seq[AttributeReference] = @@ -189,11 +200,12 @@ object UDFResolver extends Logging { } val v = - UDAFMap.getOrElseUpdate(name, mutable.MutableList[UDAFSignature]()) + UDAFMap.getOrElseUpdate(name, mutable.ListBuffer[UDAFSignature]()) v += UDAFSignature( returnType, argTypes.dataType.asInstanceOf[StructType].fields.map(_.dataType), variableArity, + allowTypeConversion, aggBufferAttributes) UDAFNames += name logInfo(s"Registered UDAF: $name($argTypes) -> $returnType") @@ -346,16 +358,27 @@ object UDFResolver extends Logging { } } + private def checkAllowTypeConversion: Boolean = { + SQLConf.get + .getConfString(VeloxBackendSettings.GLUTEN_VELOX_UDF_ALLOW_TYPE_CONVERSION, "false") + .toBoolean + } + private def getUdfExpression(name: String)(children: Seq[Expression]) = { def errorMessage: String = s"UDF $name -> ${children.map(_.dataType.simpleString).mkString(", ")} is not registered." 
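`checkAllowTypeConversion` above just reads a boolean SQL conf, so implicit casting of UDF/UDAF arguments can be toggled per session. A hedged usage sketch using only the constant referenced in the patch (the import of `VeloxBackendSettings` and an active SparkSession are assumed):

    import org.apache.spark.sql.internal.SQLConf

    // Allow arguments to be implicitly cast to a registered signature's types.
    SQLConf.get.setConfString(VeloxBackendSettings.GLUTEN_VELOX_UDF_ALLOW_TYPE_CONVERSION, "true")
    // ... resolve and run the UDF ...
    SQLConf.get.setConfString(VeloxBackendSettings.GLUTEN_VELOX_UDF_ALLOW_TYPE_CONVERSION, "false")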
+ val allowTypeConversion = checkAllowTypeConversion val signatures = UDFMap.getOrElse(name, throw new UnsupportedOperationException(errorMessage)); - - signatures.find(sig => tryBind(sig, children.map(_.dataType))) match { + signatures.find(sig => tryBind(sig, children.map(_.dataType), allowTypeConversion)) match { case Some(sig) => - UDFExpression(name, sig.expressionType.dataType, sig.expressionType.nullable, children) + UDFExpression( + name, + sig.expressionType.dataType, + sig.expressionType.nullable, + if (!allowTypeConversion && !sig.allowTypeConversion) children + else applyCast(children, sig)) case None => throw new UnsupportedOperationException(errorMessage) } @@ -365,50 +388,77 @@ object UDFResolver extends Logging { def errorMessage: String = s"UDAF $name -> ${children.map(_.dataType.simpleString).mkString(", ")} is not registered." + val allowTypeConversion = checkAllowTypeConversion val signatures = UDAFMap.getOrElse( name, throw new UnsupportedOperationException(errorMessage) ) - - signatures.find(sig => tryBind(sig, children.map(_.dataType))) match { + signatures.find(sig => tryBind(sig, children.map(_.dataType), allowTypeConversion)) match { case Some(sig) => UserDefinedAggregateFunction( name, sig.expressionType.dataType, sig.expressionType.nullable, - children, - sig.intermediateAttrs) + if (!allowTypeConversion && !sig.allowTypeConversion) children + else applyCast(children, sig), + sig.intermediateAttrs + ) case None => throw new UnsupportedOperationException(errorMessage) } } + private def tryBind( + sig: UDFSignatureBase, + requiredDataTypes: Seq[DataType], + allowTypeConversion: Boolean): Boolean = { + if ( + !tryBindStrict(sig, requiredDataTypes) && (allowTypeConversion || sig.allowTypeConversion) + ) { + tryBindWithTypeConversion(sig, requiredDataTypes) + } else { + true + } + } + // Returns true if required data types match the function signature. // If the function signature is variable arity, the number of the last argument can be zero // or more. - private def tryBind(sig: UDFSignatureBase, requiredDataTypes: Seq[DataType]): Boolean = { + private def tryBindWithTypeConversion( + sig: UDFSignatureBase, + requiredDataTypes: Seq[DataType]): Boolean = { + tryBind0(sig, requiredDataTypes, Cast.canCast) + } + + private def tryBindStrict(sig: UDFSignatureBase, requiredDataTypes: Seq[DataType]): Boolean = { + tryBind0(sig, requiredDataTypes, DataTypeUtils.sameType) + } + + private def tryBind0( + sig: UDFSignatureBase, + requiredDataTypes: Seq[DataType], + checkType: (DataType, DataType) => Boolean): Boolean = { if (!sig.variableArity) { sig.children.size == requiredDataTypes.size && - sig.children - .zip(requiredDataTypes) - .forall { case (candidate, required) => DataTypeUtils.sameType(candidate, required) } + requiredDataTypes + .zip(sig.children) + .forall { case (required, candidate) => checkType(required, candidate) } } else { // If variableArity is true, there must be at least one argument in the signature. if (requiredDataTypes.size < sig.children.size - 1) { false } else if (requiredDataTypes.size == sig.children.size - 1) { - sig.children - .dropRight(1) - .zip(requiredDataTypes) - .forall { case (candidate, required) => DataTypeUtils.sameType(candidate, required) } + requiredDataTypes + .zip(sig.children.dropRight(1)) + .forall { case (required, candidate) => checkType(required, candidate) } } else { val varArgStartIndex = sig.children.size - 1 // First check all var args has the same type with the last argument of the signature. 
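A standalone sketch, not the patch's code, of the variable-arity rule that `tryBind0` implements here: fixed arguments are checked pairwise against the signature, and every trailing argument must be compatible with the signature's last child type, where `checkType` is strict type equality or `Cast.canCast` depending on whether conversion is allowed:

    import org.apache.spark.sql.types._

    def bindsVarArgs(
        sigChildren: Seq[DataType],
        required: Seq[DataType],
        checkType: (DataType, DataType) => Boolean): Boolean = {
      val fixed = sigChildren.size - 1
      required.size >= fixed &&
      required.take(fixed).zip(sigChildren.take(fixed)).forall { case (r, c) => checkType(r, c) } &&
      required.drop(fixed).forall(r => checkType(r, sigChildren.last))
    }

    // e.g. for a signature (IntegerType, StringType*):
    // bindsVarArgs(Seq(IntegerType, StringType), Seq(IntegerType, StringType, StringType), _ == _) // true
    // bindsVarArgs(Seq(IntegerType, StringType), Seq(LongType, StringType), _ == _)                // false without casts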
if ( !requiredDataTypes .drop(varArgStartIndex) - .forall(argType => DataTypeUtils.sameType(sig.children.last, argType)) + .forall(argType => checkType(argType, sig.children.last)) ) { false } else if (varArgStartIndex == 0) { @@ -416,11 +466,38 @@ object UDFResolver extends Logging { true } else { // Whether fixed args matches. - sig.children - .dropRight(1) - .zip(requiredDataTypes.dropRight(1 + requiredDataTypes.size - sig.children.size)) - .forall { case (candidate, required) => DataTypeUtils.sameType(candidate, required) } + requiredDataTypes + .dropRight(1 + requiredDataTypes.size - sig.children.size) + .zip(sig.children.dropRight(1)) + .forall { case (required, candidate) => checkType(required, candidate) } + } + } + } + } + + private def applyCast(children: Seq[Expression], sig: UDFSignatureBase): Seq[Expression] = { + def maybeCast(expr: Expression, toType: DataType): Expression = { + if (!expr.dataType.sameType(toType)) { + Cast(expr, toType) + } else { + expr + } + } + + if (!sig.variableArity) { + children.zip(sig.children).map { case (expr, toType) => maybeCast(expr, toType) } + } else { + val fixedArgs = Math.min(children.size, sig.children.size) + val newChildren = children.take(fixedArgs).zip(sig.children.take(fixedArgs)).map { + case (expr, toType) => maybeCast(expr, toType) + } + if (children.size > sig.children.size) { + val varArgType = sig.children.last + newChildren ++ children.takeRight(children.size - sig.children.size).map { + expr => maybeCast(expr, varArgType) } + } else { + newChildren } } } diff --git a/backends-velox/src/test/java/org/apache/gluten/columnarbatch/ColumnarBatchTest.java b/backends-velox/src/test/java/org/apache/gluten/columnarbatch/ColumnarBatchTest.java index cd2ac50d350c3..54994ccd48363 100644 --- a/backends-velox/src/test/java/org/apache/gluten/columnarbatch/ColumnarBatchTest.java +++ b/backends-velox/src/test/java/org/apache/gluten/columnarbatch/ColumnarBatchTest.java @@ -16,10 +16,12 @@ */ package org.apache.gluten.columnarbatch; +import org.apache.gluten.execution.RowToVeloxColumnarExec; import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators; import org.apache.gluten.test.VeloxBackendTestBase; import org.apache.gluten.vectorized.ArrowWritableColumnVector; +import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.StructType; import org.apache.spark.sql.vectorized.ColumnarBatch; import org.apache.spark.util.TaskResources$; @@ -30,6 +32,8 @@ import java.util.Spliterators; import java.util.stream.StreamSupport; +import scala.collection.JavaConverters; + public class ColumnarBatchTest extends VeloxBackendTestBase { @Test @@ -91,6 +95,67 @@ public void testCreateByHandle() { }); } + @Test + public void testOffloadAndLoadReadRow() { + TaskResources$.MODULE$.runUnsafe( + () -> { + final int numRows = 20; + final ColumnarBatch batch = newArrowBatch("a boolean, b int", numRows); + final ArrowWritableColumnVector col0 = (ArrowWritableColumnVector) batch.column(0); + final ArrowWritableColumnVector col1 = (ArrowWritableColumnVector) batch.column(1); + for (int j = 0; j < numRows; j++) { + col0.putBoolean(j, j % 2 == 0); + col1.putInt(j, 15 - j); + } + col1.putNull(numRows - 1); + Assert.assertTrue(ColumnarBatches.isHeavyBatch(batch)); + final ColumnarBatch offloaded = + ColumnarBatches.ensureOffloaded(ArrowBufferAllocators.contextInstance(), batch); + Assert.assertTrue(ColumnarBatches.isLightBatch(offloaded)); + final ColumnarBatch loaded = + ColumnarBatches.ensureLoaded(ArrowBufferAllocators.contextInstance(), 
offloaded); + Assert.assertTrue(ColumnarBatches.isHeavyBatch(loaded)); + long cnt = + StreamSupport.stream( + Spliterators.spliteratorUnknownSize( + loaded.rowIterator(), Spliterator.ORDERED), + false) + .count(); + Assert.assertEquals(numRows, cnt); + Assert.assertEquals(loaded.getRow(0).getInt(1), 15); + loaded.close(); + return null; + }); + } + + @Test + public void testToString() { + TaskResources$.MODULE$.runUnsafe( + () -> { + final int numRows = 20; + final ColumnarBatch batch = newArrowBatch("a boolean, b int", numRows); + final ArrowWritableColumnVector col0 = (ArrowWritableColumnVector) batch.column(0); + final ArrowWritableColumnVector col1 = (ArrowWritableColumnVector) batch.column(1); + for (int j = 0; j < numRows; j++) { + col0.putBoolean(j, j % 2 == 0); + col1.putInt(j, 15 - j); + } + col1.putNull(numRows - 1); + StructType structType = new StructType(); + structType = structType.add("a", DataTypes.BooleanType, true); + structType = structType.add("b", DataTypes.IntegerType, true); + ColumnarBatch veloxBatch = + RowToVeloxColumnarExec.toColumnarBatchIterator( + JavaConverters.asScalaIterator(batch.rowIterator()), structType, numRows) + .next(); + Assert.assertEquals("[true,15]\n[false,14]", ColumnarBatches.toString(veloxBatch, 0, 2)); + Assert.assertEquals( + "[true,-3]\n[false,null]", ColumnarBatches.toString(veloxBatch, 18, 2)); + veloxBatch.close(); + return null; + }); + } + private static ColumnarBatch newArrowBatch(String schema, int numRows) { final ArrowWritableColumnVector[] columns = ArrowWritableColumnVector.allocateColumns(numRows, StructType.fromDDL(schema)); diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/1.txt index deb09c6c1c0f6..37622c9e13a75 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: 
[l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/10.txt index 2b4b7266faa67..10c654689b56e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/10.txt @@ -216,7 +216,7 @@ Arguments: X, X (36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/11.txt index 356e7b09de8ef..8289ecd8da6c1 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/11.txt @@ -166,7 +166,7 @@ Arguments: X, X (26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -199,7 +199,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/12.txt index 824d4e33aeb72..cb6c195e35544 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/12.txt @@ -114,7 +114,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -143,7 +143,7 @@ Arguments: X, X (24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, 
low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/13.txt index 0387f9bdbd1c2..2af2ba8388006 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/13.txt @@ -118,7 +118,7 @@ Arguments: X, X (16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] @@ -162,7 +162,7 @@ Arguments: X, X (26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -191,7 +191,7 @@ Arguments: X, X (33) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/14.txt index 354637343992f..4a18ef6fee831 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/14.txt @@ -103,7 +103,7 @@ Arguments: X, X (16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/15.txt index 2e51e95f07611..ea207bd018d61 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/15.txt @@ -107,7 +107,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -149,7 +149,7 @@ 
Arguments: X, X (25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt index 5f255fdaee88f..8b84f9db65999 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/16.txt @@ -123,7 +123,7 @@ Arguments: X, X (16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -163,7 +163,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -192,7 +192,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/17.txt index 7f4e4e666c0d6..200674c0b1326 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/17.txt @@ -65,7 +65,7 @@ Arguments: X, X (8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/18.txt index 6ee709d6b5a64..87714404df3ed 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/18.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/18.txt @@ -146,7 +146,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -282,7 +282,7 @@ Arguments: X, X (48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/19.txt index 0555720ef0828..6ef3621c3a505 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/19.txt @@ -102,7 +102,7 @@ Arguments: X, X (16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/20.txt index fef19a90199b2..ee99641661685 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/20.txt @@ -124,7 +124,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -261,7 +261,7 @@ Arguments: X, X (39) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (40) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] @@ -376,7 +376,7 @@ Arguments: X, X (67) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: 
rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (68) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/21.txt index b4158e426d5c1..4da38d933ee81 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/21.txt @@ -308,7 +308,7 @@ Arguments: X, X (53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/22.txt index 9db514620fa3c..cfe29976b36ac 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/22.txt @@ -101,7 +101,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -130,7 +130,7 @@ Arguments: X, X (22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/3.txt index e9e3670db8427..eacdb0cccc598 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/3.txt @@ -166,7 +166,7 @@ Arguments: X, X (27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/4.txt index 
0195ac2721596..afc352febbc3a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/4.txt @@ -120,7 +120,7 @@ Arguments: X, X (18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -149,7 +149,7 @@ Arguments: X, X (25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/5.txt index 277d203a869c1..de7994a61b14b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/5.txt @@ -320,7 +320,7 @@ Arguments: X, X (54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -349,7 +349,7 @@ Arguments: X, X (61) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/6.txt index f30bcb0da8014..451b7abd33b43 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/7.txt index 35de09c78d482..04e9c12e24838 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/7.txt @@ -290,7 +290,7 @@ Arguments: X, X (49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), 
ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -319,7 +319,7 @@ Arguments: X, X (56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/8.txt index e7fd9789b9a3c..e8e36d54cc4ad 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/8.txt @@ -421,7 +421,7 @@ Arguments: X, X (72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -454,7 +454,7 @@ Arguments: X, X (80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/9.txt index 813cf616f41c8..a91e180bd632b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark32/9.txt @@ -314,7 +314,7 @@ Arguments: X, X (53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -343,7 +343,7 @@ Arguments: X, X (60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/1.txt index 159a1598c9bf5..7e1dc79b38aa0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/10.txt index 2d91fcf6299e5..8b28d73f9f045 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/10.txt @@ -216,7 +216,7 @@ Arguments: X, X (36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/11.txt index b901a5db58150..0b96c3470133e 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/11.txt @@ -166,7 +166,7 @@ Arguments: X, X (26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -199,7 +199,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] @@ -435,7 +435,7 @@ Arguments: X, X (78) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (79) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/12.txt index 456b95e5f5ded..663f4d6d8073d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/12.txt @@ -114,7 +114,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -143,7 +143,7 @@ Arguments: X, X (24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/13.txt index 6044e9fa526c3..a51e79a7f4f58 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/13.txt @@ -118,7 +118,7 @@ Arguments: X, X (16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] @@ -162,7 +162,7 @@ Arguments: X, X (26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, 
count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -191,7 +191,7 @@ Arguments: X, X (33) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/14.txt index 0f8e52cb2056e..d349e169adf72 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/14.txt @@ -103,7 +103,7 @@ Arguments: X, X (16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/15.txt index dd611a9b29c8b..6320226094758 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/15.txt @@ -106,7 +106,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -148,7 +148,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] @@ -300,7 +300,7 @@ Arguments: X, X (51) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (52) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt index 13aa5f68019e0..0dd714b700982 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/16.txt @@ -123,7 +123,7 @@ Arguments: X, X (16) 
ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -163,7 +163,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -192,7 +192,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/17.txt index 38801e9f690a8..d0d18bdf816a8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/17.txt @@ -65,7 +65,7 @@ Arguments: X, X (8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/18.txt index 62a2f7b0b48aa..ea324851532c4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/18.txt @@ -146,7 +146,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -282,7 +282,7 @@ Arguments: X, X (48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, 
isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/19.txt index 178fa021fb70e..fa391e426cd03 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/19.txt @@ -102,7 +102,7 @@ Arguments: X, X (16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/20.txt index 62a0f1a317b11..9bfc87c2bd223 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/20.txt @@ -123,7 +123,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -260,7 +260,7 @@ Arguments: X, X (39) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (40) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] @@ -375,7 +375,7 @@ Arguments: X, X (67) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (68) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/21.txt index 045b4a38e3a3b..0ab2c88fdef9e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/21.txt @@ -307,7 +307,7 @@ Arguments: X, X (53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 
1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/22.txt index 829be008d8436..44d408a45c820 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/22.txt @@ -101,7 +101,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -130,7 +130,7 @@ Arguments: X, X (22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] @@ -269,7 +269,7 @@ Arguments: X, X (47) ColumnarExchange Input [2]: [sum#X, count#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (48) ShuffleQueryStage Output [2]: [sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/3.txt index 5ee5820d32d68..cab0ccc1015ac 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/3.txt @@ -166,7 +166,7 @@ Arguments: X, X (27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/4.txt index b9eb997a0cbbb..f4eaf3e8dc4a8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/4.txt @@ -120,7 +120,7 @@ Arguments: X, X (18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (19) ShuffleQueryStage 
Output [2]: [o_orderpriority#X, count#X] @@ -149,7 +149,7 @@ Arguments: X, X (25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/5.txt index a0d3ca93f02b1..209fc2b2544c0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/5.txt @@ -320,7 +320,7 @@ Arguments: X, X (54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -349,7 +349,7 @@ Arguments: X, X (61) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/6.txt index 1aaed506d7e08..ee04d2159020d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/7.txt index 8d95ec5e6f748..109de659cb1f7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/7.txt @@ -290,7 +290,7 @@ Arguments: X, X (49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -319,7 +319,7 @@ Arguments: X, X (56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS 
FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/8.txt index b74dc65358d2e..709642967edda 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/8.txt @@ -421,7 +421,7 @@ Arguments: X, X (72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -454,7 +454,7 @@ Arguments: X, X (80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/9.txt index 400fb12e1deaf..3b57f40101e8e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark33/9.txt @@ -314,7 +314,7 @@ Arguments: X, X (53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -343,7 +343,7 @@ Arguments: X, X (60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/1.txt index 2db104cfeb12e..03cab493a6c4a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: 
hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/10.txt index 4c263cb4a280d..81f3b18b6506e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/10.txt @@ -219,7 +219,7 @@ Arguments: X, X (36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/11.txt index 83e60d925e400..d4e2cf19c8586 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/11.txt @@ -168,7 +168,7 @@ Arguments: X, X (26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -201,7 +201,7 
@@ Arguments: X, X (34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] @@ -441,7 +441,7 @@ Arguments: X, X (78) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (79) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/12.txt index 354e97da2066a..c2f8ff7889555 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/12.txt @@ -115,7 +115,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -144,7 +144,7 @@ Arguments: X, X (24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/13.txt index 614098d876a41..0636592d586e2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/13.txt @@ -119,7 +119,7 @@ Arguments: X, X (16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] @@ -163,7 +163,7 @@ Arguments: X, X (26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -192,7 +192,7 @@ Arguments: X, X (33) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [c_count#X, 
custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/14.txt index df48e66cf0ab1..cd820d5b30638 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/14.txt @@ -104,7 +104,7 @@ Arguments: X, X (16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/15.txt index dfdcc6a1d8c76..f3e19f99de05e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/15.txt @@ -106,7 +106,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -149,7 +149,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] @@ -302,7 +302,7 @@ Arguments: X, X (51) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (52) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt index 73bf50f8f6f06..f05bc50df0b2d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/16.txt @@ -124,7 +124,7 @@ Arguments: X, X (16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -164,7 +164,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, 
p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -193,7 +193,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/17.txt index 77c18798faa24..f2b7262c75969 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/17.txt @@ -65,7 +65,7 @@ Arguments: X, X (8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/18.txt index db32a9be29f10..1ee0d23b567f2 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/18.txt @@ -146,7 +146,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -286,7 +286,7 @@ Arguments: X, X (48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/19.txt index 0a48f3fd81173..1babbc07b296c 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/19.txt @@ -103,7 +103,7 @@ Arguments: X, X (16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/20.txt index d54ea7dc1c138..5d6c218366b6b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/20.txt @@ -223,7 +223,7 @@ Arguments: X, X (31) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (32) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] @@ -341,7 +341,7 @@ Arguments: X, X (59) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/21.txt index 162a7db4d2127..0b4fa0af4b7d7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/21.txt @@ -312,7 +312,7 @@ Arguments: X, X (53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/22.txt index b1ed2c89b6eb7..161ac0ada7ec3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/22.txt @@ -102,7 +102,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -131,7 +131,7 @@ Arguments: X, X (22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: 
rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] @@ -271,7 +271,7 @@ Arguments: X, X (47) ColumnarExchange Input [2]: [sum#X, count#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (48) ShuffleQueryStage Output [2]: [sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/3.txt index 9673efca9bd60..c247160f67924 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/3.txt @@ -168,7 +168,7 @@ Arguments: X, X (27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/4.txt index ce4180123f4c0..f4c04d19e78d6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/4.txt @@ -121,7 +121,7 @@ Arguments: X, X (18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -150,7 +150,7 @@ Arguments: X, X (25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/5.txt index 742f9db1a0545..5dc04bd48a57a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/5.txt @@ -325,7 +325,7 @@ Arguments: X, X (54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, 
[n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -354,7 +354,7 @@ Arguments: X, X (61) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/6.txt index 89cf24d874744..f8169a34a98ff 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/7.txt index 7b0e80cded477..d80dbd225fa8c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/7.txt @@ -295,7 +295,7 @@ Arguments: X, X (49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -324,7 +324,7 @@ Arguments: X, X (56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/8.txt index 394c10ccb0123..4f035d06361dd 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/8.txt @@ -428,7 +428,7 @@ Arguments: X, X (72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (73) 
ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -461,7 +461,7 @@ Arguments: X, X (80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/9.txt index f9fe0dbc9ac17..deaf8f857e670 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj-ras/spark34/9.txt @@ -319,7 +319,7 @@ Arguments: X, X (53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -348,7 +348,7 @@ Arguments: X, X (60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/1.txt index 5ceb73b301db8..b1b2a7507c210 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS 
FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/10.txt index 5aeddac49db50..a87354004bf3e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/10.txt @@ -216,7 +216,7 @@ Arguments: X, X (36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/11.txt index 7d95202ba98fc..5ad4086071110 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/11.txt @@ -166,7 +166,7 @@ Arguments: X, X (26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -199,7 +199,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/12.txt index 39d1eb13754eb..0bbd522e9ca72 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/12.txt @@ -114,7 +114,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -143,7 +143,7 @@ 
Arguments: X, X (24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/13.txt index 1033f582f2b91..eceeb7738e8b6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/13.txt @@ -118,7 +118,7 @@ Arguments: X, X (16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] @@ -162,7 +162,7 @@ Arguments: X, X (26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -191,7 +191,7 @@ Arguments: X, X (33) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/14.txt index b9231f934ea8a..f991f7b32cb01 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/14.txt @@ -103,7 +103,7 @@ Arguments: X, X (16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/15.txt index f3b74c2c51045..c5c3670b31088 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/15.txt @@ -107,7 +107,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_suppkey#X, 
sum#X, isEmpty#X] @@ -149,7 +149,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt index 440e56487b6f3..51e5f9c64466d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/16.txt @@ -123,7 +123,7 @@ Arguments: X, X (16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -163,7 +163,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -192,7 +192,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/17.txt index 7f4e4e666c0d6..200674c0b1326 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/17.txt @@ -65,7 +65,7 @@ Arguments: X, X (8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/18.txt index 6da312afa5f9a..d6528a10e0d6a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/18.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/18.txt @@ -146,7 +146,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -282,7 +282,7 @@ Arguments: X, X (48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/19.txt index 835386f95fdc7..218c7e7d36823 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/19.txt @@ -102,7 +102,7 @@ Arguments: X, X (16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/20.txt index 51eaaa2a51b9b..925c9b4c1df78 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/20.txt @@ -124,7 +124,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -261,7 +261,7 @@ Arguments: X, X (39) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (40) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] @@ -376,7 +376,7 @@ Arguments: X, X (67) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC 
NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (68) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/21.txt index 875f05406e856..0ef77a3b40580 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/21.txt @@ -308,7 +308,7 @@ Arguments: X, X (53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/22.txt index e0b3b12c7b964..e3cda8bcfc8b1 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/22.txt @@ -101,7 +101,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -130,7 +130,7 @@ Arguments: X, X (22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/3.txt index ed5f34e91889e..4e3cdc99706b0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/3.txt @@ -166,7 +166,7 @@ Arguments: X, X (27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/4.txt index a24a32699b79f..1332c8ba96637 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/4.txt @@ -120,7 +120,7 @@ Arguments: X, X (18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -149,7 +149,7 @@ Arguments: X, X (25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/5.txt index 178ee7c7e2f38..1445f75cfcb42 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/5.txt @@ -320,7 +320,7 @@ Arguments: X, X (54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -349,7 +349,7 @@ Arguments: X, X (61) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/6.txt index ee8c494f56a30..2b97fd28f1476 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/7.txt index f0e7e37d3887e..5f3cf3fa7c566 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/7.txt @@ -290,7 +290,7 @@ Arguments: X, X (49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] 
+Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -319,7 +319,7 @@ Arguments: X, X (56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/8.txt index 9d9abd6b04bf3..a64588c5ba985 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/8.txt @@ -421,7 +421,7 @@ Arguments: X, X (72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -454,7 +454,7 @@ Arguments: X, X (80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/9.txt index 2b213b681a052..45bfb4c1996df 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark32/9.txt @@ -314,7 +314,7 @@ Arguments: X, X (53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -343,7 +343,7 @@ Arguments: X, X (60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/1.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/1.txt index 12c19c45e38d1..417ddf9de25d0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/10.txt index adaeaf49efeeb..7b8d0238637da 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/10.txt @@ -216,7 +216,7 @@ Arguments: X, X (36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/11.txt index 25d3c4ac2f75b..e01bfff2886b1 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/11.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/11.txt @@ -166,7 +166,7 @@ Arguments: X, X (26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -199,7 +199,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] @@ -435,7 +435,7 @@ Arguments: X, X (78) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (79) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/12.txt index 21681feeacc14..490806d71daaf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/12.txt @@ -114,7 +114,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -143,7 +143,7 @@ Arguments: X, X (24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/13.txt index 808e8189cca4f..c3ac3e0903c8a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/13.txt @@ -118,7 +118,7 @@ Arguments: X, X (16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] @@ -162,7 +162,7 @@ Arguments: X, X (26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], 
[plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -191,7 +191,7 @@ Arguments: X, X (33) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/14.txt index 6ac8b1cd94d36..67bce21483fb1 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/14.txt @@ -103,7 +103,7 @@ Arguments: X, X (16) ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/15.txt index 671000b2f28d2..9a7879d5c1d16 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/15.txt @@ -106,7 +106,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -148,7 +148,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] @@ -300,7 +300,7 @@ Arguments: X, X (51) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (52) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt index 19e334e888ec5..6faaec51bb6b0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/16.txt @@ -123,7 +123,7 @@ Arguments: X, X (16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), 
ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -163,7 +163,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -192,7 +192,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/17.txt index 38801e9f690a8..d0d18bdf816a8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/17.txt @@ -65,7 +65,7 @@ Arguments: X, X (8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/18.txt index 26995352c1cfe..661a04a3db6ab 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/18.txt @@ -146,7 +146,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -282,7 +282,7 @@ Arguments: X, X (48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, 
o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/19.txt index ce6a3f10da038..47e5d26dbdd68 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/19.txt @@ -102,7 +102,7 @@ Arguments: X, X (16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/20.txt index 125e495397e8f..b15eb6d9c3654 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/20.txt @@ -123,7 +123,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -260,7 +260,7 @@ Arguments: X, X (39) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (40) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] @@ -375,7 +375,7 @@ Arguments: X, X (67) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (68) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/21.txt index 6e411317dd33f..8a2bf8db611d8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/21.txt @@ -307,7 +307,7 @@ Arguments: X, X (53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/22.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/22.txt index a4883c6228f4a..e7ea95ab686f8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/22.txt @@ -101,7 +101,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -130,7 +130,7 @@ Arguments: X, X (22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] @@ -269,7 +269,7 @@ Arguments: X, X (47) ColumnarExchange Input [2]: [sum#X, count#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (48) ShuffleQueryStage Output [2]: [sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/3.txt index 41aa9b576a9d8..03ff6cc3d00ed 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/3.txt @@ -166,7 +166,7 @@ Arguments: X, X (27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/4.txt index 7f87abe3e7a28..c6f0f246e5fec 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/4.txt @@ -120,7 +120,7 @@ Arguments: X, X (18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -149,7 +149,7 @@ Arguments: X, X (25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: 
rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/5.txt index 6780a88b37da3..213c8e698e9d5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/5.txt @@ -320,7 +320,7 @@ Arguments: X, X (54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -349,7 +349,7 @@ Arguments: X, X (61) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/6.txt index 864cdf3832e13..bb9d04cdcdc78 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/7.txt index 5f81d7e4298bd..3f54f20f050e7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/7.txt @@ -290,7 +290,7 @@ Arguments: X, X (49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -319,7 +319,7 @@ Arguments: X, X (56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, 
revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/8.txt index 292fc5bfd4a84..ba0d25c60800f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/8.txt @@ -421,7 +421,7 @@ Arguments: X, X (72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -454,7 +454,7 @@ Arguments: X, X (80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/9.txt index b5ed852a37e44..d15efa6a65675 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark33/9.txt @@ -314,7 +314,7 @@ Arguments: X, X (53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -343,7 +343,7 @@ Arguments: X, X (60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/1.txt index 0c773785eafce..450797d3aefd9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, 
l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/10.txt index c4f94275a4408..7a2404c1d6102 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/10.txt @@ -219,7 +219,7 @@ Arguments: X, X (36) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (37) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/11.txt index cb4bff2a0ea33..2bf02786562ed 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/11.txt @@ -168,7 +168,7 @@ Arguments: X, X (26) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -201,7 +201,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] @@ -441,7 +441,7 @@ Arguments: X, X 
(78) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (79) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/12.txt index 099fdbce656a3..b796cfae26651 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/12.txt @@ -115,7 +115,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -144,7 +144,7 @@ Arguments: X, X (24) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/13.txt index 3a2fb9b787603..3b7ac85bf23d1 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/13.txt @@ -119,7 +119,7 @@ Arguments: X, X (16) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, count#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [c_custkey#X, count#X] @@ -163,7 +163,7 @@ Arguments: X, X (26) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (27) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -192,7 +192,7 @@ Arguments: X, X (33) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/14.txt index ed116936e3e2f..7f4949cf72887 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/14.txt @@ -104,7 +104,7 @@ Arguments: X, X (16) 
ColumnarExchange Input [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [sum#X, isEmpty#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/15.txt index e7ac9b2efce16..5fa837e185f07 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/15.txt @@ -106,7 +106,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -149,7 +149,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] @@ -302,7 +302,7 @@ Arguments: X, X (51) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (52) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt index 2cc727f9782b6..a6302f1beeb50 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/16.txt @@ -124,7 +124,7 @@ Arguments: X, X (16) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -164,7 +164,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -193,7 +193,7 @@ Arguments: X, X (32) ColumnarExchange 
Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/17.txt index 77c18798faa24..f2b7262c75969 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/17.txt @@ -65,7 +65,7 @@ Arguments: X, X (8) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/18.txt index 88e9352841ac7..029c44b1547f8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/18.txt @@ -146,7 +146,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -286,7 +286,7 @@ Arguments: X, X (48) ColumnarExchange Input [8]: [hash_partition_key#X, c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, 1), ENSURE_REQUIREMENTS, [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (49) ShuffleQueryStage Output [7]: [c_name#X, c_custkey#X, o_orderkey#X, o_orderdate#X, o_totalprice#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/19.txt index 01b07807e5576..069500cf6bdfe 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/19.txt @@ -103,7 +103,7 @@ Arguments: X, X (16) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (17) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/20.txt index f4eea85b7b425..3e2b19835441d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/20.txt @@ -223,7 +223,7 @@ Arguments: X, X (31) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (32) ShuffleQueryStage Output [4]: [l_partkey#X, l_suppkey#X, sum#X, isEmpty#X] @@ -341,7 +341,7 @@ Arguments: X, X (59) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/21.txt index ae73b6fd04232..c93cb695dbd94 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/21.txt @@ -312,7 +312,7 @@ Arguments: X, X (53) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/22.txt index 5526710f89e17..b5450682267eb 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/22.txt @@ -102,7 +102,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -131,7 +131,7 @@ Arguments: X, X (22) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (23) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] @@ -271,7 +271,7 @@ Arguments: X, X (47) ColumnarExchange Input [2]: [sum#X, count#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (48) 
ShuffleQueryStage Output [2]: [sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/3.txt index 4a37e3343dfac..a1635e742a6b1 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/3.txt @@ -168,7 +168,7 @@ Arguments: X, X (27) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, o_orderdate#X, o_shippriority#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [l_orderkey#X, o_orderdate#X, o_shippriority#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/4.txt index 0bff0b16383f5..4a26f959b7bbd 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/4.txt @@ -121,7 +121,7 @@ Arguments: X, X (18) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (19) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -150,7 +150,7 @@ Arguments: X, X (25) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/5.txt index ff4a7828c4b4e..3ba742ae5b6ac 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/5.txt @@ -325,7 +325,7 @@ Arguments: X, X (54) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (55) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -354,7 +354,7 @@ Arguments: X, X (61) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (62) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/6.txt index 9463fa1da9d93..fa37c656a1aac 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/7.txt index 0925850d92375..ab8abd6df3640 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/7.txt @@ -295,7 +295,7 @@ Arguments: X, X (49) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -324,7 +324,7 @@ Arguments: X, X (56) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (57) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/8.txt index 56fd86d255b44..e075a7b338855 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/8.txt @@ -428,7 +428,7 @@ Arguments: X, X (72) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (73) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -461,7 +461,7 @@ Arguments: X, X (80) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (81) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/9.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/9.txt index 1f25540dd1363..8ba39a425d25c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-bhj/spark34/9.txt @@ -319,7 +319,7 @@ Arguments: X, X (53) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (54) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -348,7 +348,7 @@ Arguments: X, X (60) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (61) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/1.txt index deb09c6c1c0f6..37622c9e13a75 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/10.txt index 49e18f05aa63f..0f902d3da7c7a 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/10.txt @@ -113,7 +113,7 @@ Arguments: X, X (6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] @@ -150,7 +150,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -181,7 +181,7 @@ Arguments: X, X (23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] @@ -218,7 +218,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] @@ -249,7 +249,7 @@ Arguments: X, X (40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] @@ -286,7 +286,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], 
[plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -328,7 +328,7 @@ Arguments: X, X (59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/11.txt index 7aff321e6ae69..72d04de069706 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/11.txt @@ -97,7 +97,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -134,7 +134,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -165,7 +165,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] @@ -202,7 +202,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -244,7 +244,7 @@ Arguments: X, X (42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] 
-Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -277,7 +277,7 @@ Arguments: X, X (50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/12.txt index 0ac613d3792b5..b9d0cd3457fd8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/12.txt @@ -74,7 +74,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -111,7 +111,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] @@ -153,7 +153,7 @@ Arguments: X, X (25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -182,7 +182,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/13.txt index de18e59ae904b..af0732c1391a6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/13.txt @@ -72,7 +72,7 @@ Arguments: X, X (5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (6) 
ShuffleQueryStage Output [1]: [c_custkey#X] @@ -109,7 +109,7 @@ Arguments: X, X (14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -169,7 +169,7 @@ Arguments: X, X (27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -198,7 +198,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/14.txt index 66c324638566c..a5270d5dc57a6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/14.txt @@ -62,7 +62,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] @@ -99,7 +99,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/15.txt index 3aac8d913ac72..e1ec46810d6b9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/15.txt @@ -71,7 +71,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] @@ -119,7 +119,7 @@ Arguments: X, X (17) ColumnarExchange Input 
[4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -161,7 +161,7 @@ Arguments: X, X (27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt index abf71575d8541..c4d8034425c84 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/16.txt @@ -88,7 +88,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] @@ -125,7 +125,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] @@ -167,7 +167,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -207,7 +207,7 @@ Arguments: X, X (34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -236,7 +236,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], 
[id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/17.txt index b3b530a92eb6a..196539e2104af 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/17.txt @@ -84,7 +84,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] @@ -121,7 +121,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -174,7 +174,7 @@ Arguments: X, X (27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/18.txt index 7845c0868dc36..b267b885d65cf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/18.txt @@ -124,7 +124,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] @@ -161,7 +161,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -200,7 +200,7 @@ Arguments: X, X (24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, 
sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -246,7 +246,7 @@ Arguments: X, X (35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -277,7 +277,7 @@ Arguments: X, X (43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] @@ -314,7 +314,7 @@ Arguments: X, X (52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/19.txt index a6fc173e233bb..7ee4b49b8dfe5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/19.txt @@ -61,7 +61,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -98,7 +98,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/20.txt index 46cf1c79418f0..6c3340893e183 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/20.txt @@ -152,7 +152,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -189,7 +189,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -226,7 +226,7 @@ Arguments: X, X (24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -257,7 +257,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -294,7 +294,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] @@ -360,7 +360,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] @@ -391,7 +391,7 @@ Arguments: X, X (65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (66) ShuffleQueryStage Output [1]: [ps_suppkey#X] @@ -422,7 +422,7 @@ 
Arguments: X, X (73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] @@ -459,7 +459,7 @@ Arguments: X, X (82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (83) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -490,7 +490,7 @@ Arguments: X, X (90) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/21.txt index e1e46cf28ffdd..91a56bf6b7c5d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/21.txt @@ -145,7 +145,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] @@ -182,7 +182,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -214,7 +214,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -256,7 +256,7 @@ Arguments: X, X (33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -287,7 +287,7 @@ Arguments: X, X (41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_suppkey#X, 1), 
ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -318,7 +318,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] @@ -355,7 +355,7 @@ Arguments: X, X (58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (59) ShuffleQueryStage Output [1]: [o_orderkey#X] @@ -386,7 +386,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] @@ -423,7 +423,7 @@ Arguments: X, X (75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (76) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -465,7 +465,7 @@ Arguments: X, X (85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (86) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/22.txt index 8582ecdb64d43..01b1f033b5193 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/22.txt @@ -71,7 +71,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] @@ -103,7 +103,7 @@ Arguments: X, X (14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage 
Output [1]: [o_custkey#X] @@ -145,7 +145,7 @@ Arguments: X, X (24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -174,7 +174,7 @@ Arguments: X, X (31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/3.txt index f1f128ce00fbf..44aea5e819439 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/3.txt @@ -86,7 +86,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -123,7 +123,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] @@ -154,7 +154,7 @@ Arguments: X, X (23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] @@ -191,7 +191,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/4.txt index 6eb069e562dec..c192184acd31d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/4.txt +++ 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/4.txt @@ -75,7 +75,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -112,7 +112,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [l_orderkey#X] @@ -154,7 +154,7 @@ Arguments: X, X (25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -183,7 +183,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/5.txt index 1c71444a83d4f..6c4e6aa31ebc3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/5.txt @@ -159,7 +159,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -196,7 +196,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -227,7 +227,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] @@ -264,7 +264,7 @@ Arguments: X, X (32) ColumnarExchange Input [5]: 
[hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -295,7 +295,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -332,7 +332,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -363,7 +363,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -400,7 +400,7 @@ Arguments: X, X (66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] @@ -431,7 +431,7 @@ Arguments: X, X (74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] @@ -468,7 +468,7 @@ Arguments: X, X (83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -510,7 +510,7 
@@ Arguments: X, X (93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -539,7 +539,7 @@ Arguments: X, X (100) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/6.txt index f30bcb0da8014..451b7abd33b43 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/7.txt index 0179c5ce37f1d..ba296c641bb94 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/7.txt @@ -153,7 +153,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -190,7 +190,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -221,7 +221,7 @@ Arguments: X, X (23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -258,7 +258,7 @@ Arguments: X, X (32) 
ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -289,7 +289,7 @@ Arguments: X, X (40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] @@ -326,7 +326,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -357,7 +357,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] @@ -394,7 +394,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -425,7 +425,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] @@ -480,7 +480,7 @@ Arguments: X, X (88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), 
ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -509,7 +509,7 @@ Arguments: X, X (95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/8.txt index ce21c6ca2cae4..f75207dbdca96 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/8.txt @@ -202,7 +202,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -239,7 +239,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -270,7 +270,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -307,7 +307,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -338,7 +338,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, 
l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -375,7 +375,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] @@ -406,7 +406,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] @@ -443,7 +443,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -474,7 +474,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] @@ -511,7 +511,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] @@ -542,7 +542,7 @@ Arguments: X, X (91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] @@ -579,7 +579,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: 
[hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -610,7 +610,7 @@ Arguments: X, X (108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] @@ -647,7 +647,7 @@ Arguments: X, X (117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (118) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -689,7 +689,7 @@ Arguments: X, X (127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -722,7 +722,7 @@ Arguments: X, X (135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/9.txt index 06d8d6e0ef000..79d82de9da661 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark32/9.txt @@ -158,7 +158,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -195,7 +195,7 @@ Arguments: X, X (15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], 
[shuffle_writer_type=hash] (16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -226,7 +226,7 @@ Arguments: X, X (23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -263,7 +263,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -294,7 +294,7 @@ Arguments: X, X (40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -331,7 +331,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] @@ -362,7 +362,7 @@ Arguments: X, X (57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] @@ -399,7 +399,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), 
ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] @@ -430,7 +430,7 @@ Arguments: X, X (74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] @@ -467,7 +467,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -509,7 +509,7 @@ Arguments: X, X (93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -538,7 +538,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/1.txt index 159a1598c9bf5..7e1dc79b38aa0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, 
sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/10.txt index 35cf0c574fde7..0fd13247bf746 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/10.txt @@ -113,7 +113,7 @@ Arguments: X, X (6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] @@ -150,7 +150,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -181,7 +181,7 @@ Arguments: X, X (23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] @@ -218,7 +218,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] @@ -249,7 +249,7 @@ Arguments: X, X (40) ColumnarExchange Input [10]: 
[hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] @@ -286,7 +286,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -328,7 +328,7 @@ Arguments: X, X (59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/11.txt index e24c94d48247e..8e8d875244549 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/11.txt @@ -97,7 +97,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -134,7 +134,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -165,7 +165,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, 
ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] @@ -202,7 +202,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -244,7 +244,7 @@ Arguments: X, X (42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -277,7 +277,7 @@ Arguments: X, X (50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] @@ -500,7 +500,7 @@ Arguments: X, X (88) ColumnarExchange Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -544,7 +544,7 @@ Arguments: X, X (100) ColumnarExchange Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/12.txt index 870f9ccebd86c..883294a4f8cc4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/12.txt @@ -74,7 +74,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -111,7 +111,7 @@ 
Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] @@ -153,7 +153,7 @@ Arguments: X, X (25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -182,7 +182,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/13.txt index ff5bfe2540967..c035c1d4c0609 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/13.txt @@ -72,7 +72,7 @@ Arguments: X, X (5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (6) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -109,7 +109,7 @@ Arguments: X, X (14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -169,7 +169,7 @@ Arguments: X, X (27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -198,7 +198,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/14.txt index 247b15e33ba3d..bf6acad56fea3 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/14.txt @@ -62,7 +62,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] @@ -99,7 +99,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/15.txt index ecb61718853ca..9f0f6936c0e85 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/15.txt @@ -70,7 +70,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] @@ -118,7 +118,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -160,7 +160,7 @@ Arguments: X, X (27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] @@ -320,7 +320,7 @@ Arguments: X, X (55) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (56) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt index 50a27d401534d..0490704a3ac09 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/16.txt @@ -88,7 +88,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] @@ -125,7 +125,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] @@ -167,7 +167,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -207,7 +207,7 @@ Arguments: X, X (34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -236,7 +236,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/17.txt index e3596c36ce389..8aeec8a4ecb4e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/17.txt @@ -84,7 +84,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], 
[shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] @@ -121,7 +121,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -174,7 +174,7 @@ Arguments: X, X (27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/18.txt index e84c68a092d6b..95c531a92eba3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/18.txt @@ -124,7 +124,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] @@ -161,7 +161,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -200,7 +200,7 @@ Arguments: X, X (24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -246,7 +246,7 @@ Arguments: X, X (35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -277,7 +277,7 @@ Arguments: X, X (43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, 
o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] @@ -314,7 +314,7 @@ Arguments: X, X (52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/19.txt index ec373f6a4c881..abcc6b67af9bf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/19.txt @@ -61,7 +61,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -98,7 +98,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/20.txt index bb3a76ec6bb6d..365ccaf207606 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/20.txt @@ -151,7 +151,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -188,7 +188,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], 
[shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -225,7 +225,7 @@ Arguments: X, X (24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -256,7 +256,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -293,7 +293,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] @@ -359,7 +359,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] @@ -390,7 +390,7 @@ Arguments: X, X (65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (66) ShuffleQueryStage Output [1]: [ps_suppkey#X] @@ -421,7 +421,7 @@ Arguments: X, X (73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] @@ -458,7 +458,7 @@ Arguments: X, X (82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (83) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -489,7 +489,7 @@ Arguments: X, X (90) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: 
rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/21.txt index 428acf4c91224..aedeb8ce0a315 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/21.txt @@ -144,7 +144,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] @@ -181,7 +181,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -213,7 +213,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -255,7 +255,7 @@ Arguments: X, X (33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -286,7 +286,7 @@ Arguments: X, X (41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -317,7 +317,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] @@ -354,7 +354,7 @@ Arguments: X, X (58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], 
[shuffle_writer_type=hash] (59) ShuffleQueryStage Output [1]: [o_orderkey#X] @@ -385,7 +385,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] @@ -422,7 +422,7 @@ Arguments: X, X (75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (76) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -464,7 +464,7 @@ Arguments: X, X (85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (86) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/22.txt index 86272c2a803ea..7a51c8367a7ee 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/22.txt @@ -71,7 +71,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] @@ -103,7 +103,7 @@ Arguments: X, X (14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [1]: [o_custkey#X] @@ -145,7 +145,7 @@ Arguments: X, X (24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -174,7 +174,7 @@ Arguments: X, X (31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] @@ -325,7 +325,7 @@ Arguments: X, X (59) ColumnarExchange Input [2]: [sum#X, count#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, 
[plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [2]: [sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/3.txt index 40f7aeb35749b..b9304b4b971d9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/3.txt @@ -86,7 +86,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -123,7 +123,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] @@ -154,7 +154,7 @@ Arguments: X, X (23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] @@ -191,7 +191,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/4.txt index eb6a3e27f9f39..32081ef1cb06a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/4.txt @@ -75,7 +75,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -112,7 +112,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] +Arguments: 
hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [l_orderkey#X] @@ -154,7 +154,7 @@ Arguments: X, X (25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -183,7 +183,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/5.txt index 20d0aa276e59b..0320703d520c7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/5.txt @@ -159,7 +159,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -196,7 +196,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -227,7 +227,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] @@ -264,7 +264,7 @@ Arguments: X, X (32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -295,7 +295,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, 
l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -332,7 +332,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -363,7 +363,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -400,7 +400,7 @@ Arguments: X, X (66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] @@ -431,7 +431,7 @@ Arguments: X, X (74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] @@ -468,7 +468,7 @@ Arguments: X, X (83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -510,7 +510,7 @@ Arguments: X, X (93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -539,7 +539,7 @@ Arguments: X, X (100) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output 
[2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/6.txt index 1aaed506d7e08..ee04d2159020d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/7.txt index 2261654d4a500..7a74c3c290e93 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/7.txt @@ -153,7 +153,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -190,7 +190,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -221,7 +221,7 @@ Arguments: X, X (23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -258,7 +258,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -289,7 +289,7 @@ Arguments: X, X (40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: 
hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] @@ -326,7 +326,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -357,7 +357,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] @@ -394,7 +394,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -425,7 +425,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] @@ -480,7 +480,7 @@ Arguments: X, X (88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -509,7 +509,7 @@ Arguments: X, X (95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (96) ShuffleQueryStage Output [4]: [supp_nation#X, 
cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/8.txt index 3e836d995cb96..40e793695a6fd 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/8.txt @@ -202,7 +202,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -239,7 +239,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -270,7 +270,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -307,7 +307,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -338,7 +338,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -375,7 +375,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] @@ -406,7 +406,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: 
[hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] @@ -443,7 +443,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -474,7 +474,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] @@ -511,7 +511,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] @@ -542,7 +542,7 @@ Arguments: X, X (91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] @@ -579,7 +579,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -610,7 +610,7 @@ Arguments: X, X (108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), 
ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] @@ -647,7 +647,7 @@ Arguments: X, X (117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (118) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -689,7 +689,7 @@ Arguments: X, X (127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -722,7 +722,7 @@ Arguments: X, X (135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/9.txt index 8ed229096ba6a..ecf1daa6109ed 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark33/9.txt @@ -158,7 +158,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -195,7 +195,7 @@ Arguments: X, X (15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -226,7 +226,7 @@ Arguments: X, X (23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, 
l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -263,7 +263,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -294,7 +294,7 @@ Arguments: X, X (40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -331,7 +331,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] @@ -362,7 +362,7 @@ Arguments: X, X (57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] @@ -399,7 +399,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] @@ -430,7 +430,7 @@ Arguments: X, X (74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [6]: 
[l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] @@ -467,7 +467,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -509,7 +509,7 @@ Arguments: X, X (93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -538,7 +538,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/1.txt index 2db104cfeb12e..03cab493a6c4a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git 
a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/10.txt index 02aaa69a3ea8f..e1e4f7e748e42 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/10.txt @@ -113,7 +113,7 @@ Arguments: X, X (6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] @@ -150,7 +150,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -182,7 +182,7 @@ Arguments: X, X (23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] @@ -219,7 +219,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] @@ -251,7 +251,7 @@ Arguments: X, X (40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] @@ -288,7 
+288,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -331,7 +331,7 @@ Arguments: X, X (59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/11.txt index 39187c1a1c262..6bf42468036bb 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/11.txt @@ -97,7 +97,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -134,7 +134,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -166,7 +166,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] @@ -203,7 +203,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], 
[shuffle_writer_type=hash] (33) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -246,7 +246,7 @@ Arguments: X, X (42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -279,7 +279,7 @@ Arguments: X, X (50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] @@ -504,7 +504,7 @@ Arguments: X, X (88) ColumnarExchange Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -549,7 +549,7 @@ Arguments: X, X (100) ColumnarExchange Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/12.txt index 2ce16d0a2db54..6ab47cdeee876 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/12.txt @@ -74,7 +74,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -111,7 +111,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] @@ -154,7 +154,7 @@ Arguments: X, X (25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], 
[plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -183,7 +183,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/13.txt index 8989431cc83c4..a887ddc89aa2f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/13.txt @@ -72,7 +72,7 @@ Arguments: X, X (5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (6) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -109,7 +109,7 @@ Arguments: X, X (14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -170,7 +170,7 @@ Arguments: X, X (27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -199,7 +199,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/14.txt index f9e3f564c7828..92dec8827f67f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/14.txt @@ -62,7 +62,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] @@ -99,7 +99,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] 
-Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/15.txt index ca94a960e811f..68a0bbc910791 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/15.txt @@ -70,7 +70,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] @@ -118,7 +118,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -161,7 +161,7 @@ Arguments: X, X (27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] @@ -322,7 +322,7 @@ Arguments: X, X (55) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (56) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt index 5813bbd1af9b7..933eb6c625a84 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/16.txt @@ -88,7 +88,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] @@ -125,7 +125,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -Arguments: hashpartitioning(p_partkey#X, 1), 
ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] @@ -168,7 +168,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -208,7 +208,7 @@ Arguments: X, X (34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -237,7 +237,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/17.txt index 8b4e121891f29..74f04de98e047 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/17.txt @@ -84,7 +84,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] @@ -121,7 +121,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -175,7 +175,7 @@ Arguments: X, X (27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), 
ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/18.txt index 3f7c919c62129..7fc42261f6ddd 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/18.txt @@ -124,7 +124,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] @@ -161,7 +161,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -200,7 +200,7 @@ Arguments: X, X (24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -247,7 +247,7 @@ Arguments: X, X (35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -279,7 +279,7 @@ Arguments: X, X (43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] @@ -316,7 +316,7 @@ Arguments: X, X (52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (53) ShuffleQueryStage Output [2]: [l_orderkey#X, 
l_quantity#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/19.txt index f29554305a6df..e022b9cf6311c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/19.txt @@ -61,7 +61,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -98,7 +98,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/20.txt index 227d18bd0f213..be57604c31f9f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/20.txt @@ -151,7 +151,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -188,7 +188,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -225,7 +225,7 @@ Arguments: X, X (24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -257,7 +257,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: 
hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -294,7 +294,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] @@ -361,7 +361,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] @@ -393,7 +393,7 @@ Arguments: X, X (65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (66) ShuffleQueryStage Output [1]: [ps_suppkey#X] @@ -425,7 +425,7 @@ Arguments: X, X (73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] @@ -462,7 +462,7 @@ Arguments: X, X (82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (83) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -494,7 +494,7 @@ Arguments: X, X (90) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/21.txt index 1ffbdd45fef90..a7b10521886e3 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/21.txt @@ -144,7 +144,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] 
+Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] @@ -181,7 +181,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -213,7 +213,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -256,7 +256,7 @@ Arguments: X, X (33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -288,7 +288,7 @@ Arguments: X, X (41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -320,7 +320,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] @@ -357,7 +357,7 @@ Arguments: X, X (58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (59) ShuffleQueryStage Output [1]: [o_orderkey#X] @@ -389,7 +389,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] @@ -426,7 +426,7 @@ Arguments: X, X (75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, 
[n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (76) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -469,7 +469,7 @@ Arguments: X, X (85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (86) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/22.txt index fd4141aec0af9..150060e8c04b0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/22.txt @@ -71,7 +71,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] @@ -103,7 +103,7 @@ Arguments: X, X (14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [1]: [o_custkey#X] @@ -146,7 +146,7 @@ Arguments: X, X (24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -175,7 +175,7 @@ Arguments: X, X (31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] @@ -327,7 +327,7 @@ Arguments: X, X (59) ColumnarExchange Input [2]: [sum#X, count#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [2]: [sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/3.txt index c7aed3993470e..e3697b056ec0b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/3.txt @@ -86,7 +86,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), 
ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -123,7 +123,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] @@ -155,7 +155,7 @@ Arguments: X, X (23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] @@ -192,7 +192,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/4.txt index 72999dce97b3c..65488adcaca5e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/4.txt @@ -75,7 +75,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -112,7 +112,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [l_orderkey#X] @@ -155,7 +155,7 @@ Arguments: X, X (25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -184,7 +184,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 
1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/5.txt index 388d18a0314d7..2e872491d4c17 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/5.txt @@ -159,7 +159,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -196,7 +196,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -228,7 +228,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] @@ -265,7 +265,7 @@ Arguments: X, X (32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -297,7 +297,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -334,7 +334,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [s_suppkey#X, 
s_nationkey#X] @@ -366,7 +366,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -403,7 +403,7 @@ Arguments: X, X (66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] @@ -435,7 +435,7 @@ Arguments: X, X (74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] @@ -472,7 +472,7 @@ Arguments: X, X (83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -515,7 +515,7 @@ Arguments: X, X (93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -544,7 +544,7 @@ Arguments: X, X (100) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/6.txt index 89cf24d874744..f8169a34a98ff 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/7.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/7.txt index 6284f06e5d2c5..9f03472f4bf5e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/7.txt @@ -153,7 +153,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -190,7 +190,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -222,7 +222,7 @@ Arguments: X, X (23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -259,7 +259,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -291,7 +291,7 @@ Arguments: X, X (40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] @@ -328,7 +328,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -360,7 +360,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: 
[hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] @@ -397,7 +397,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -429,7 +429,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] @@ -485,7 +485,7 @@ Arguments: X, X (88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -514,7 +514,7 @@ Arguments: X, X (95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/8.txt index 2dc7abc112bd4..e858f7a1e9994 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/8.txt @@ -202,7 +202,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -239,7 +239,7 @@ 
Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -271,7 +271,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -308,7 +308,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -340,7 +340,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -377,7 +377,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] @@ -409,7 +409,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] @@ -446,7 +446,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] 
+Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -478,7 +478,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] @@ -515,7 +515,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] @@ -547,7 +547,7 @@ Arguments: X, X (91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] @@ -584,7 +584,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -616,7 +616,7 @@ Arguments: X, X (108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] @@ -653,7 +653,7 @@ Arguments: X, X (117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (118) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -696,7 +696,7 @@ Arguments: X, X (127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: 
hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -729,7 +729,7 @@ Arguments: X, X (135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/9.txt index ce095ca705bcf..39b2996cde0e9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1-ras/spark34/9.txt @@ -158,7 +158,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -195,7 +195,7 @@ Arguments: X, X (15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -227,7 +227,7 @@ Arguments: X, X (23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -264,7 +264,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -296,7 +296,7 @@ Arguments: X, X (40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, 
l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -333,7 +333,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] @@ -365,7 +365,7 @@ Arguments: X, X (57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] @@ -402,7 +402,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] @@ -434,7 +434,7 @@ Arguments: X, X (74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] @@ -471,7 +471,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -514,7 +514,7 @@ Arguments: X, X (93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], 
[id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -543,7 +543,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/1.txt index 5ceb73b301db8..b1b2a7507c210 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/10.txt index b6c3dc67a1a5d..6bb1784778cd6 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/10.txt @@ -113,7 +113,7 @@ Arguments: X, X (6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, 
c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] @@ -150,7 +150,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -181,7 +181,7 @@ Arguments: X, X (23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] @@ -218,7 +218,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] @@ -249,7 +249,7 @@ Arguments: X, X (40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] @@ -286,7 +286,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -328,7 +328,7 @@ Arguments: X, X (59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, 
c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/11.txt index 48d14d1812d18..42e37d72f12d8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/11.txt @@ -97,7 +97,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -134,7 +134,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -165,7 +165,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] @@ -202,7 +202,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -244,7 +244,7 @@ Arguments: X, X (42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -277,7 +277,7 @@ Arguments: X, X (50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], 
[shuffle_writer_type=hash] (51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/12.txt index 0a836a6d8ed77..40f651dd347e0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/12.txt @@ -74,7 +74,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -111,7 +111,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] @@ -153,7 +153,7 @@ Arguments: X, X (25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -182,7 +182,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/13.txt index 522a695f73b16..6c48d7e59901d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/13.txt @@ -72,7 +72,7 @@ Arguments: X, X (5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (6) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -109,7 +109,7 @@ Arguments: X, X (14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -169,7 +169,7 @@ Arguments: X, X (27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), 
ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -198,7 +198,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/14.txt index d050850332af2..b04d8d643fda8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/14.txt @@ -62,7 +62,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] @@ -99,7 +99,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/15.txt index 1d085e52a5d4d..83bd88329661e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/15.txt @@ -71,7 +71,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] @@ -119,7 +119,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -161,7 +161,7 @@ Arguments: X, X (27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X 
ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt index 4b6a18e968609..4c43ec8f19b68 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/16.txt @@ -88,7 +88,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] @@ -125,7 +125,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] @@ -167,7 +167,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -207,7 +207,7 @@ Arguments: X, X (34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -236,7 +236,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/17.txt index 6ebaa345a5a99..6a9d19d0defe5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/17.txt @@ -84,7 +84,7 @@ Arguments: X, X (6) 
ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] @@ -121,7 +121,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -174,7 +174,7 @@ Arguments: X, X (27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/18.txt index de38e4d98b651..587eb37cd4c66 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/18.txt @@ -124,7 +124,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] @@ -161,7 +161,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -200,7 +200,7 @@ Arguments: X, X (24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -246,7 +246,7 @@ Arguments: X, X (35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (36) 
ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -277,7 +277,7 @@ Arguments: X, X (43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] @@ -314,7 +314,7 @@ Arguments: X, X (52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/19.txt index 84520a9699026..75795b3eaab52 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/19.txt @@ -61,7 +61,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -98,7 +98,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/20.txt index d581db858a0c1..ebcbc702cf472 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/20.txt @@ -152,7 +152,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -189,7 +189,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, 
ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -226,7 +226,7 @@ Arguments: X, X (24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -257,7 +257,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -294,7 +294,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] @@ -360,7 +360,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] @@ -391,7 +391,7 @@ Arguments: X, X (65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (66) ShuffleQueryStage Output [1]: [ps_suppkey#X] @@ -422,7 +422,7 @@ Arguments: X, X (73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] @@ -459,7 +459,7 @@ Arguments: X, X (82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], 
[shuffle_writer_type=hash] (83) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -490,7 +490,7 @@ Arguments: X, X (90) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/21.txt index c9375e9823a34..af2e469119be0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/21.txt @@ -145,7 +145,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] @@ -182,7 +182,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -214,7 +214,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -256,7 +256,7 @@ Arguments: X, X (33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -287,7 +287,7 @@ Arguments: X, X (41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -318,7 +318,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] @@ -355,7 +355,7 @@ Arguments: X, X (58) 
ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (59) ShuffleQueryStage Output [1]: [o_orderkey#X] @@ -386,7 +386,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] @@ -423,7 +423,7 @@ Arguments: X, X (75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (76) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -465,7 +465,7 @@ Arguments: X, X (85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (86) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/22.txt index 9e899a1b5e15b..7e43e67ab1e62 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/22.txt @@ -71,7 +71,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] @@ -103,7 +103,7 @@ Arguments: X, X (14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [1]: [o_custkey#X] @@ -145,7 +145,7 @@ Arguments: X, X (24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -174,7 +174,7 @@ Arguments: X, X (31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, 
[plan_id=X], [shuffle_writer_type=hash] (32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/3.txt index a3fb4062c83d0..b5d79b5819200 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/3.txt @@ -86,7 +86,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -123,7 +123,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] @@ -154,7 +154,7 @@ Arguments: X, X (23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] @@ -191,7 +191,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/4.txt index eae6f66bbb18b..7c07aa0134164 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/4.txt @@ -75,7 +75,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -112,7 +112,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], 
[shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [l_orderkey#X] @@ -154,7 +154,7 @@ Arguments: X, X (25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -183,7 +183,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/5.txt index 662616ac077a8..f481ec2d47efb 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/5.txt @@ -159,7 +159,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -196,7 +196,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -227,7 +227,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] @@ -264,7 +264,7 @@ Arguments: X, X (32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -295,7 +295,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 
1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -332,7 +332,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -363,7 +363,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -400,7 +400,7 @@ Arguments: X, X (66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] @@ -431,7 +431,7 @@ Arguments: X, X (74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] @@ -468,7 +468,7 @@ Arguments: X, X (83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -510,7 +510,7 @@ Arguments: X, X (93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -539,7 +539,7 @@ Arguments: X, X (100) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/6.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/6.txt index ee8c494f56a30..2b97fd28f1476 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/7.txt index ef168ad7bb8ff..82d0d75e311f8 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/7.txt @@ -153,7 +153,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -190,7 +190,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -221,7 +221,7 @@ Arguments: X, X (23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -258,7 +258,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -289,7 +289,7 @@ Arguments: X, X (40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], 
[shuffle_writer_type=hash] (41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] @@ -326,7 +326,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -357,7 +357,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] @@ -394,7 +394,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -425,7 +425,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] @@ -480,7 +480,7 @@ Arguments: X, X (88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -509,7 +509,7 @@ Arguments: X, X (95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/8.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/8.txt index 55e4f19764d1d..6a7d97a628f8f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/8.txt @@ -202,7 +202,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -239,7 +239,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -270,7 +270,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -307,7 +307,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -338,7 +338,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -375,7 +375,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] @@ -406,7 +406,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 
1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] @@ -443,7 +443,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -474,7 +474,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] @@ -511,7 +511,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] @@ -542,7 +542,7 @@ Arguments: X, X (91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] @@ -579,7 +579,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -610,7 +610,7 @@ Arguments: X, X (108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (109) 
ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] @@ -647,7 +647,7 @@ Arguments: X, X (117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (118) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -689,7 +689,7 @@ Arguments: X, X (127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -722,7 +722,7 @@ Arguments: X, X (135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/9.txt index f07b5ce81c144..a0f1d9c68a436 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark32/9.txt @@ -158,7 +158,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -195,7 +195,7 @@ Arguments: X, X (15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -226,7 +226,7 @@ Arguments: X, X (23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -263,7 +263,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, 
s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -294,7 +294,7 @@ Arguments: X, X (40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -331,7 +331,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] @@ -362,7 +362,7 @@ Arguments: X, X (57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] @@ -399,7 +399,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] @@ -430,7 +430,7 @@ Arguments: X, X (74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] @@ -467,7 +467,7 @@ Arguments: X, X (83) 
ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -509,7 +509,7 @@ Arguments: X, X (93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -538,7 +538,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/1.txt index 12c19c45e38d1..417ddf9de25d0 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/10.txt index 
d26861c8bcb37..e42af1ec4cdc5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/10.txt @@ -113,7 +113,7 @@ Arguments: X, X (6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] @@ -150,7 +150,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -181,7 +181,7 @@ Arguments: X, X (23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] @@ -218,7 +218,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] @@ -249,7 +249,7 @@ Arguments: X, X (40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] @@ -286,7 +286,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), 
ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -328,7 +328,7 @@ Arguments: X, X (59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/11.txt index 5dee0fa9091b5..fe2dcc9190df9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/11.txt @@ -97,7 +97,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -134,7 +134,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -165,7 +165,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] @@ -202,7 +202,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -244,7 +244,7 @@ Arguments: X, X (42) ColumnarExchange Input [4]: [hash_partition_key#X, 
ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -277,7 +277,7 @@ Arguments: X, X (50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] @@ -500,7 +500,7 @@ Arguments: X, X (88) ColumnarExchange Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -544,7 +544,7 @@ Arguments: X, X (100) ColumnarExchange Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/12.txt index 4545acbc282aa..2ee35e463cb3d 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/12.txt @@ -74,7 +74,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -111,7 +111,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] @@ -153,7 +153,7 @@ Arguments: X, X (25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -182,7 +182,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [l_shipmode#X, 
high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/13.txt index 758ece2d8fce7..d59a69766201e 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/13.txt @@ -72,7 +72,7 @@ Arguments: X, X (5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (6) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -109,7 +109,7 @@ Arguments: X, X (14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -169,7 +169,7 @@ Arguments: X, X (27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -198,7 +198,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/14.txt index ce943bd043e02..e570d1a4cfb0c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/14.txt @@ -62,7 +62,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] @@ -99,7 +99,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], 
[shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/15.txt index 62b0047bcc06c..51baae5186faf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/15.txt @@ -70,7 +70,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] @@ -118,7 +118,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -160,7 +160,7 @@ Arguments: X, X (27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] @@ -320,7 +320,7 @@ Arguments: X, X (55) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (56) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt index 573dfb3514f58..ab417c50bc7bb 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/16.txt @@ -88,7 +88,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] @@ -125,7 +125,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage 
Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] @@ -167,7 +167,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -207,7 +207,7 @@ Arguments: X, X (34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -236,7 +236,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/17.txt index 7e1d41ca42510..ad32d4cb586da 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/17.txt @@ -84,7 +84,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] @@ -121,7 +121,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -174,7 +174,7 @@ Arguments: X, X (27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/18.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/18.txt index 42b2e6f762911..42f398883883a 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/18.txt @@ -124,7 +124,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] @@ -161,7 +161,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -200,7 +200,7 @@ Arguments: X, X (24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -246,7 +246,7 @@ Arguments: X, X (35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -277,7 +277,7 @@ Arguments: X, X (43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] @@ -314,7 +314,7 @@ Arguments: X, X (52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/19.txt index d05ba10a15a65..20fea0a016290 100644 --- 
a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/19.txt @@ -61,7 +61,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -98,7 +98,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/20.txt index d74e9ecde9991..fdaed6a1a7e8c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/20.txt @@ -151,7 +151,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -188,7 +188,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -225,7 +225,7 @@ Arguments: X, X (24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -256,7 +256,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -293,7 +293,7 @@ 
Arguments: X, X (41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] @@ -359,7 +359,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] @@ -390,7 +390,7 @@ Arguments: X, X (65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (66) ShuffleQueryStage Output [1]: [ps_suppkey#X] @@ -421,7 +421,7 @@ Arguments: X, X (73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] @@ -458,7 +458,7 @@ Arguments: X, X (82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (83) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -489,7 +489,7 @@ Arguments: X, X (90) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/21.txt index 36ecd282aa4ff..557dd678da526 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/21.txt @@ -144,7 +144,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] @@ -181,7 +181,7 @@ Arguments: X, X (15) ColumnarExchange 
Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -213,7 +213,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -255,7 +255,7 @@ Arguments: X, X (33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -286,7 +286,7 @@ Arguments: X, X (41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -317,7 +317,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] @@ -354,7 +354,7 @@ Arguments: X, X (58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (59) ShuffleQueryStage Output [1]: [o_orderkey#X] @@ -385,7 +385,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] @@ -422,7 +422,7 @@ Arguments: X, X (75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (76) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -464,7 +464,7 @@ Arguments: X, X (85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, 
[s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (86) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/22.txt index 50a049ea3ce9b..edd50a4764bee 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/22.txt @@ -71,7 +71,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] @@ -103,7 +103,7 @@ Arguments: X, X (14) ColumnarExchange Input [2]: [hash_partition_key#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [1]: [o_custkey#X] @@ -145,7 +145,7 @@ Arguments: X, X (24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -174,7 +174,7 @@ Arguments: X, X (31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] @@ -325,7 +325,7 @@ Arguments: X, X (59) ColumnarExchange Input [2]: [sum#X, count#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [2]: [sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/3.txt index 4d701845a38af..ea8a29d6594f4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/3.txt @@ -86,7 +86,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -123,7 +123,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_custkey#X, 1), 
ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] @@ -154,7 +154,7 @@ Arguments: X, X (23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] @@ -191,7 +191,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/4.txt index 163bb15a43fa7..ca3f2707b8e73 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/4.txt @@ -75,7 +75,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -112,7 +112,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [l_orderkey#X] @@ -154,7 +154,7 @@ Arguments: X, X (25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -183,7 +183,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/5.txt 
b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/5.txt index 6aadfc753a9af..ccedfac29b46c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/5.txt @@ -159,7 +159,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -196,7 +196,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -227,7 +227,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] @@ -264,7 +264,7 @@ Arguments: X, X (32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -295,7 +295,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -332,7 +332,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -363,7 +363,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), 
ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -400,7 +400,7 @@ Arguments: X, X (66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] @@ -431,7 +431,7 @@ Arguments: X, X (74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] @@ -468,7 +468,7 @@ Arguments: X, X (83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -510,7 +510,7 @@ Arguments: X, X (93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -539,7 +539,7 @@ Arguments: X, X (100) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/6.txt index 864cdf3832e13..bb9d04cdcdc78 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/7.txt index 85a7dbf545a47..046147da8ef83 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/7.txt @@ -153,7 +153,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, 
s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -190,7 +190,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -221,7 +221,7 @@ Arguments: X, X (23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -258,7 +258,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -289,7 +289,7 @@ Arguments: X, X (40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] @@ -326,7 +326,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -357,7 +357,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, 
l_shipdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] @@ -394,7 +394,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -425,7 +425,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] @@ -480,7 +480,7 @@ Arguments: X, X (88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -509,7 +509,7 @@ Arguments: X, X (95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/8.txt index 88597b61d0eeb..664d74c619856 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/8.txt @@ -202,7 +202,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -239,7 +239,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, 
l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -270,7 +270,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -307,7 +307,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -338,7 +338,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -375,7 +375,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] @@ -406,7 +406,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] @@ -443,7 +443,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -474,7 +474,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -Arguments: 
hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] @@ -511,7 +511,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] @@ -542,7 +542,7 @@ Arguments: X, X (91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] @@ -579,7 +579,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -610,7 +610,7 @@ Arguments: X, X (108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] @@ -647,7 +647,7 @@ Arguments: X, X (117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (118) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -689,7 +689,7 @@ Arguments: X, X (127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -722,7 +722,7 @@ Arguments: X, X (135) 
ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/9.txt index 884d1d3563b76..1ae7a93998dca 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark33/9.txt @@ -158,7 +158,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -195,7 +195,7 @@ Arguments: X, X (15) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -226,7 +226,7 @@ Arguments: X, X (23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -263,7 +263,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -294,7 +294,7 @@ Arguments: X, X (40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, 
l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -331,7 +331,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] @@ -362,7 +362,7 @@ Arguments: X, X (57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] @@ -399,7 +399,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] @@ -430,7 +430,7 @@ Arguments: X, X (74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] @@ -467,7 +467,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -509,7 +509,7 @@ Arguments: X, X (93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -538,7 +538,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC 
NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/1.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/1.txt index 0c773785eafce..450797d3aefd9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/1.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/1.txt @@ -64,7 +64,7 @@ Arguments: X, X (8) ColumnarExchange Input [18]: [hash_partition_key#X, l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] -Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_returnflag#X, l_linestatus#X, 1), ENSURE_REQUIREMENTS, [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (9) ShuffleQueryStage Output [17]: [l_returnflag#X, l_linestatus#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, isEmpty#X, sum#X, count#X, sum#X, count#X, sum#X, count#X, count#X] @@ -93,7 +93,7 @@ Arguments: X, X (15) ColumnarExchange Input [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] -Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_returnflag#X ASC NULLS FIRST, l_linestatus#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [10]: [l_returnflag#X, l_linestatus#X, sum_qty#X, sum_base_price#X, sum_disc_price#X, sum_charge#X, avg_qty#X, avg_price#X, avg_disc#X, count_order#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/10.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/10.txt index 3d4d5db424cd6..6a296c163eb3f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/10.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/10.txt @@ -113,7 +113,7 @@ Arguments: X, X (6) ColumnarExchange Input [8]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [7]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X] @@ -150,7 +150,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, 
[o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -182,7 +182,7 @@ Arguments: X, X (23) ColumnarExchange Input [9]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [8]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, o_orderkey#X] @@ -219,7 +219,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] @@ -251,7 +251,7 @@ Arguments: X, X (40) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, c_address#X, c_nationkey#X, c_phone#X, c_acctbal#X, c_comment#X, l_extendedprice#X, l_discount#X] @@ -288,7 +288,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -331,7 +331,7 @@ Arguments: X, X (59) ColumnarExchange Input [10]: [hash_partition_key#X, c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [9]: [c_custkey#X, c_name#X, 
c_acctbal#X, c_phone#X, n_name#X, c_address#X, c_comment#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/11.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/11.txt index 8dce41dc2898a..8fef547f8d1f4 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/11.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/11.txt @@ -97,7 +97,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -134,7 +134,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -166,7 +166,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [ps_partkey#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] @@ -203,7 +203,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -246,7 +246,7 @@ Arguments: X, X (42) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (43) ShuffleQueryStage Output [3]: [ps_partkey#X, sum#X, isEmpty#X] @@ -279,7 +279,7 @@ Arguments: X, X (50) ColumnarExchange Input [2]: [ps_partkey#X, value#X] -Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(value#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (51) ShuffleQueryStage Output [2]: [ps_partkey#X, value#X] @@ -504,7 +504,7 @@ Arguments: X, X (88) ColumnarExchange Input [4]: [hash_partition_key#X, ps_suppkey#X, ps_availqty#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), 
ENSURE_REQUIREMENTS, [ps_suppkey#X, ps_availqty#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [3]: [ps_suppkey#X, ps_availqty#X, ps_supplycost#X] @@ -549,7 +549,7 @@ Arguments: X, X (100) ColumnarExchange Input [4]: [hash_partition_key#X, ps_availqty#X, ps_supplycost#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [ps_availqty#X, ps_supplycost#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [ps_availqty#X, ps_supplycost#X, s_nationkey#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/12.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/12.txt index ffa8c4b4ac028..a5ff0ee4b58b9 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/12.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/12.txt @@ -74,7 +74,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -111,7 +111,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_shipmode#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_shipmode#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_shipmode#X] @@ -154,7 +154,7 @@ Arguments: X, X (25) ColumnarExchange Input [4]: [hash_partition_key#X, l_shipmode#X, sum#X, sum#X] -Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_shipmode#X, 1), ENSURE_REQUIREMENTS, [l_shipmode#X, sum#X, sum#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [3]: [l_shipmode#X, sum#X, sum#X] @@ -183,7 +183,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] -Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(l_shipmode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_shipmode#X, high_line_count#X, low_line_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/13.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/13.txt index a2da9210750e1..f0604101ed62c 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/13.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/13.txt @@ -72,7 +72,7 @@ Arguments: X, X (5) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (6) ShuffleQueryStage Output 
[1]: [c_custkey#X] @@ -109,7 +109,7 @@ Arguments: X, X (14) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -170,7 +170,7 @@ Arguments: X, X (27) ColumnarExchange Input [3]: [hash_partition_key#X, c_count#X, count#X] -Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_count#X, 1), ENSURE_REQUIREMENTS, [c_count#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [2]: [c_count#X, count#X] @@ -199,7 +199,7 @@ Arguments: X, X (34) ColumnarExchange Input [2]: [c_count#X, custdist#X] -Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(custdist#X DESC NULLS LAST, c_count#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (35) ShuffleQueryStage Output [2]: [c_count#X, custdist#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/14.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/14.txt index 9994b8328915c..768c18ddac04b 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/14.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/14.txt @@ -62,7 +62,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_extendedprice#X, l_discount#X] @@ -99,7 +99,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, p_partkey#X, p_type#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_type#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [p_partkey#X, p_type#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/15.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/15.txt index 2c9849a2b1cf7..c91d8a7f50263 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/15.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/15.txt @@ -70,7 +70,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_phone#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_phone#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X] @@ -118,7 +118,7 @@ Arguments: X, X (17) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] 
-Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (18) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] @@ -161,7 +161,7 @@ Arguments: X, X (27) ColumnarExchange Input [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] -Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_suppkey#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [5]: [s_suppkey#X, s_name#X, s_address#X, s_phone#X, total_revenue#X] @@ -322,7 +322,7 @@ Arguments: X, X (55) ColumnarExchange Input [4]: [hash_partition_key#X, l_suppkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_suppkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (56) ShuffleQueryStage Output [3]: [l_suppkey#X, sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt index 4d7f8090a3b1c..501e4a122ebe7 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/16.txt @@ -88,7 +88,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [ps_partkey#X, ps_suppkey#X] @@ -125,7 +125,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_type#X, p_size#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_type#X, p_size#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_type#X, p_size#X] @@ -168,7 +168,7 @@ Arguments: X, X (25) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, ps_suppkey#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, ps_suppkey#X] @@ -208,7 +208,7 @@ Arguments: X, X (34) ColumnarExchange Input [5]: [hash_partition_key#X, p_brand#X, p_type#X, p_size#X, count#X] -Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_brand#X, p_type#X, p_size#X, 1), ENSURE_REQUIREMENTS, [p_brand#X, p_type#X, p_size#X, count#X], [plan_id=X], [shuffle_writer_type=hash] 
(35) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, count#X] @@ -237,7 +237,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] -Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supplier_cnt#X DESC NULLS LAST, p_brand#X ASC NULLS FIRST, p_type#X ASC NULLS FIRST, p_size#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [4]: [p_brand#X, p_type#X, p_size#X, supplier_cnt#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/17.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/17.txt index aba53c7c1f324..b8d72004b1856 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/17.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/17.txt @@ -84,7 +84,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [l_partkey#X, l_quantity#X, l_extendedprice#X] @@ -121,7 +121,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -175,7 +175,7 @@ Arguments: X, X (27) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, sum#X, count#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, sum#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (28) ShuffleQueryStage Output [3]: [l_partkey#X, sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/18.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/18.txt index 740918c259fc0..2cb8a12dd05ad 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/18.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/18.txt @@ -124,7 +124,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_name#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_name#X] @@ -161,7 +161,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], 
[plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -200,7 +200,7 @@ Arguments: X, X (24) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [3]: [l_orderkey#X, sum#X, isEmpty#X] @@ -247,7 +247,7 @@ Arguments: X, X (35) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (36) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_totalprice#X, o_orderdate#X] @@ -279,7 +279,7 @@ Arguments: X, X (43) ColumnarExchange Input [6]: [hash_partition_key#X, c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (44) ShuffleQueryStage Output [5]: [c_custkey#X, c_name#X, o_orderkey#X, o_totalprice#X, o_orderdate#X] @@ -316,7 +316,7 @@ Arguments: X, X (52) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_quantity#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (53) ShuffleQueryStage Output [2]: [l_orderkey#X, l_quantity#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/19.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/19.txt index 1acbbcee646f0..37cdd2a85b292 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/19.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/19.txt @@ -61,7 +61,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [l_partkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -98,7 +98,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, p_partkey#X, p_brand#X, p_size#X, p_container#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X, p_brand#X, p_size#X, p_container#X], [plan_id=X], [shuffle_writer_type=hash] (16) 
ShuffleQueryStage Output [4]: [p_partkey#X, p_brand#X, p_size#X, p_container#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/20.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/20.txt index 54b461c23e0f4..cbb0cdea34058 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/20.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/20.txt @@ -151,7 +151,7 @@ Arguments: X, X (6) ColumnarExchange Input [5]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [4]: [s_suppkey#X, s_name#X, s_address#X, s_nationkey#X] @@ -188,7 +188,7 @@ Arguments: X, X (15) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -225,7 +225,7 @@ Arguments: X, X (24) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -257,7 +257,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_availqty#X] -Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_partkey#X, ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_availqty#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_availqty#X] @@ -294,7 +294,7 @@ Arguments: X, X (41) ColumnarExchange Input [4]: [hash_partition_key#X, l_partkey#X, l_suppkey#X, l_quantity#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_partkey#X, l_suppkey#X, l_quantity#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [3]: [l_partkey#X, l_suppkey#X, l_quantity#X] @@ -361,7 +361,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, (0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, l_suppkey#X, 1), ENSURE_REQUIREMENTS, [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [(0.5 * sum(l_quantity))#X, l_partkey#X, l_suppkey#X] @@ -393,7 +393,7 @@ Arguments: X, X (65) ColumnarExchange Input [2]: [hash_partition_key#X, ps_suppkey#X] -Arguments: 
hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, 1), ENSURE_REQUIREMENTS, [ps_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (66) ShuffleQueryStage Output [1]: [ps_suppkey#X] @@ -425,7 +425,7 @@ Arguments: X, X (73) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_address#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_address#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (74) ShuffleQueryStage Output [3]: [s_name#X, s_address#X, s_nationkey#X] @@ -462,7 +462,7 @@ Arguments: X, X (82) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (83) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -494,7 +494,7 @@ Arguments: X, X (90) ColumnarExchange Input [2]: [s_name#X, s_address#X] -Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(s_name#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (91) ShuffleQueryStage Output [2]: [s_name#X, s_address#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/21.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/21.txt index e752cdadeec94..5ec8fe9d155bf 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/21.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/21.txt @@ -144,7 +144,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, s_suppkey#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [s_suppkey#X, s_name#X, s_nationkey#X] @@ -181,7 +181,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -213,7 +213,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -256,7 +256,7 @@ Arguments: X, X (33) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], 
[plan_id=X], [shuffle_writer_type=hash] (34) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -288,7 +288,7 @@ Arguments: X, X (41) ColumnarExchange Input [3]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X], [plan_id=X], [shuffle_writer_type=hash] (42) ShuffleQueryStage Output [2]: [l_orderkey#X, l_suppkey#X] @@ -320,7 +320,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, s_name#X, s_nationkey#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X, l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [s_name#X, s_nationkey#X, l_orderkey#X] @@ -357,7 +357,7 @@ Arguments: X, X (58) ColumnarExchange Input [2]: [hash_partition_key#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (59) ShuffleQueryStage Output [1]: [o_orderkey#X] @@ -389,7 +389,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_name#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [s_name#X, s_nationkey#X] @@ -426,7 +426,7 @@ Arguments: X, X (75) ColumnarExchange Input [2]: [hash_partition_key#X, n_nationkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (76) ShuffleQueryStage Output [1]: [n_nationkey#X] @@ -469,7 +469,7 @@ Arguments: X, X (85) ColumnarExchange Input [3]: [hash_partition_key#X, s_name#X, count#X] -Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_name#X, 1), ENSURE_REQUIREMENTS, [s_name#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (86) ShuffleQueryStage Output [2]: [s_name#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/22.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/22.txt index f4597ae85ae23..fee21bac69fe5 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/22.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/22.txt @@ -71,7 +71,7 @@ Arguments: X, X (6) ColumnarExchange Input [4]: [hash_partition_key#X, c_custkey#X, c_phone#X, c_acctbal#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_phone#X, c_acctbal#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [3]: [c_custkey#X, c_phone#X, c_acctbal#X] @@ -103,7 +103,7 @@ Arguments: X, X (14) ColumnarExchange Input [2]: [hash_partition_key#X, 
o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (15) ShuffleQueryStage Output [1]: [o_custkey#X] @@ -146,7 +146,7 @@ Arguments: X, X (24) ColumnarExchange Input [5]: [hash_partition_key#X, cntrycode#X, count#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(cntrycode#X, 1), ENSURE_REQUIREMENTS, [cntrycode#X, count#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (25) ShuffleQueryStage Output [4]: [cntrycode#X, count#X, sum#X, isEmpty#X] @@ -175,7 +175,7 @@ Arguments: X, X (31) ColumnarExchange Input [3]: [cntrycode#X, numcust#X, totacctbal#X] -Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(cntrycode#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (32) ShuffleQueryStage Output [3]: [cntrycode#X, numcust#X, totacctbal#X] @@ -327,7 +327,7 @@ Arguments: X, X (59) ColumnarExchange Input [2]: [sum#X, count#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (60) ShuffleQueryStage Output [2]: [sum#X, count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/3.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/3.txt index 6ae748ed5b026..83064ad3a01db 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/3.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/3.txt @@ -86,7 +86,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, c_custkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [c_custkey#X] @@ -123,7 +123,7 @@ Arguments: X, X (15) ColumnarExchange Input [5]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [4]: [o_orderkey#X, o_custkey#X, o_orderdate#X, o_shippriority#X] @@ -155,7 +155,7 @@ Arguments: X, X (23) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X, o_shippriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X, o_shippriority#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [3]: [o_orderkey#X, o_orderdate#X, o_shippriority#X] @@ -192,7 +192,7 @@ Arguments: X, X (32) ColumnarExchange Input [4]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], 
[plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [3]: [l_orderkey#X, l_extendedprice#X, l_discount#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/4.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/4.txt index a51a10913f363..3f055feb01b68 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/4.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/4.txt @@ -75,7 +75,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderpriority#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderpriority#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderpriority#X] @@ -112,7 +112,7 @@ Arguments: X, X (15) ColumnarExchange Input [2]: [hash_partition_key#X, l_orderkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [1]: [l_orderkey#X] @@ -155,7 +155,7 @@ Arguments: X, X (25) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderpriority#X, count#X] -Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderpriority#X, 1), ENSURE_REQUIREMENTS, [o_orderpriority#X, count#X], [plan_id=X], [shuffle_writer_type=hash] (26) ShuffleQueryStage Output [2]: [o_orderpriority#X, count#X] @@ -184,7 +184,7 @@ Arguments: X, X (32) ColumnarExchange Input [2]: [o_orderpriority#X, order_count#X] -Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_orderpriority#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderpriority#X, order_count#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/5.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/5.txt index 15425399ceb8e..87eaa9b8dce6f 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/5.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/5.txt @@ -159,7 +159,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -196,7 +196,7 @@ Arguments: X, X (15) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -228,7 
+228,7 @@ Arguments: X, X (23) ColumnarExchange Input [3]: [hash_partition_key#X, c_nationkey#X, o_orderkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, o_orderkey#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [2]: [c_nationkey#X, o_orderkey#X] @@ -265,7 +265,7 @@ Arguments: X, X (32) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -297,7 +297,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, c_nationkey#X, 1), ENSURE_REQUIREMENTS, [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [c_nationkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -334,7 +334,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -366,7 +366,7 @@ Arguments: X, X (57) ColumnarExchange Input [4]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [3]: [l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -403,7 +403,7 @@ Arguments: X, X (66) ColumnarExchange Input [4]: [hash_partition_key#X, n_nationkey#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [3]: [n_nationkey#X, n_name#X, n_regionkey#X] @@ -435,7 +435,7 @@ Arguments: X, X (74) ColumnarExchange Input [5]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X], [plan_id=X], 
[shuffle_writer_type=hash] (75) ShuffleQueryStage Output [4]: [l_extendedprice#X, l_discount#X, n_name#X, n_regionkey#X] @@ -472,7 +472,7 @@ Arguments: X, X (83) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -515,7 +515,7 @@ Arguments: X, X (93) ColumnarExchange Input [4]: [hash_partition_key#X, n_name#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_name#X, 1), ENSURE_REQUIREMENTS, [n_name#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [3]: [n_name#X, sum#X, isEmpty#X] @@ -544,7 +544,7 @@ Arguments: X, X (100) ColumnarExchange Input [2]: [n_name#X, revenue#X] -Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(revenue#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_name#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/6.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/6.txt index 9463fa1da9d93..fa37c656a1aac 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/6.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/6.txt @@ -52,7 +52,7 @@ Arguments: X, X (7) ColumnarExchange Input [2]: [sum#X, isEmpty#X] -Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: SinglePartition, ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (8) ShuffleQueryStage Output [2]: [sum#X, isEmpty#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/7.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/7.txt index 2960110d5b70e..4f781dd6d07ed 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/7.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/7.txt @@ -153,7 +153,7 @@ Arguments: X, X (6) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -190,7 +190,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -222,7 +222,7 @@ Arguments: X, X (23) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] 
-Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [5]: [s_nationkey#X, l_orderkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X] @@ -259,7 +259,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_custkey#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [o_orderkey#X, o_custkey#X] @@ -291,7 +291,7 @@ Arguments: X, X (40) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, o_custkey#X] @@ -328,7 +328,7 @@ Arguments: X, X (49) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -360,7 +360,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [s_nationkey#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X] @@ -397,7 +397,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -429,7 +429,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (75) 
ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, l_shipdate#X, c_nationkey#X, n_name#X] @@ -485,7 +485,7 @@ Arguments: X, X (88) ColumnarExchange Input [6]: [hash_partition_key#X, supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(supp_nation#X, cust_nation#X, l_year#X, 1), ENSURE_REQUIREMENTS, [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (89) ShuffleQueryStage Output [5]: [supp_nation#X, cust_nation#X, l_year#X, sum#X, isEmpty#X] @@ -514,7 +514,7 @@ Arguments: X, X (95) ColumnarExchange Input [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] -Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(supp_nation#X ASC NULLS FIRST, cust_nation#X ASC NULLS FIRST, l_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (96) ShuffleQueryStage Output [4]: [supp_nation#X, cust_nation#X, l_year#X, revenue#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/8.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/8.txt index 3cacef9cdd5e1..ae71cca0e43cc 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/8.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/8.txt @@ -202,7 +202,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -239,7 +239,7 @@ Arguments: X, X (15) ColumnarExchange Input [6]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [5]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -271,7 +271,7 @@ Arguments: X, X (23) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [4]: [l_orderkey#X, l_suppkey#X, l_extendedprice#X, l_discount#X] @@ -308,7 +308,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], 
[shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -340,7 +340,7 @@ Arguments: X, X (40) ColumnarExchange Input [5]: [hash_partition_key#X, l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [4]: [l_orderkey#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -377,7 +377,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, o_orderkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [o_orderkey#X, o_custkey#X, o_orderdate#X] @@ -409,7 +409,7 @@ Arguments: X, X (57) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_custkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_custkey#X, o_orderdate#X] @@ -446,7 +446,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, c_custkey#X, c_nationkey#X] -Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_custkey#X, 1), ENSURE_REQUIREMENTS, [c_custkey#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [c_custkey#X, c_nationkey#X] @@ -478,7 +478,7 @@ Arguments: X, X (74) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] -Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(c_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, c_nationkey#X] @@ -515,7 +515,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_regionkey#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_regionkey#X] @@ -547,7 +547,7 @@ Arguments: X, X (91) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] -Arguments: hashpartitioning(s_nationkey#X, 1), 
ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (92) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, s_nationkey#X, o_orderdate#X, n_regionkey#X] @@ -584,7 +584,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -616,7 +616,7 @@ Arguments: X, X (108) ColumnarExchange Input [6]: [hash_partition_key#X, l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] -Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_regionkey#X, 1), ENSURE_REQUIREMENTS, [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (109) ShuffleQueryStage Output [5]: [l_extendedprice#X, l_discount#X, o_orderdate#X, n_regionkey#X, n_name#X] @@ -653,7 +653,7 @@ Arguments: X, X (117) ColumnarExchange Input [2]: [hash_partition_key#X, r_regionkey#X] -Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(r_regionkey#X, 1), ENSURE_REQUIREMENTS, [r_regionkey#X], [plan_id=X], [shuffle_writer_type=hash] (118) ShuffleQueryStage Output [1]: [r_regionkey#X] @@ -696,7 +696,7 @@ Arguments: X, X (127) ColumnarExchange Input [6]: [hash_partition_key#X, o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_year#X, 1), ENSURE_REQUIREMENTS, [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (128) ShuffleQueryStage Output [5]: [o_year#X, sum#X, isEmpty#X, sum#X, isEmpty#X] @@ -729,7 +729,7 @@ Arguments: X, X (135) ColumnarExchange Input [2]: [o_year#X, mkt_share#X] -Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(o_year#X ASC NULLS FIRST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (136) ShuffleQueryStage Output [2]: [o_year#X, mkt_share#X] diff --git a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/9.txt b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/9.txt index 61b6401046abf..a26c30f45f857 100644 --- a/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/9.txt +++ b/backends-velox/src/test/resources/tpch-approved-plan/v1/spark34/9.txt @@ -158,7 +158,7 @@ Arguments: X, X (6) ColumnarExchange Input [2]: [hash_partition_key#X, p_partkey#X] -Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(p_partkey#X, 1), ENSURE_REQUIREMENTS, [p_partkey#X], [plan_id=X], [shuffle_writer_type=hash] (7) ShuffleQueryStage Output [1]: [p_partkey#X] @@ -195,7 +195,7 @@ Arguments: X, X (15) ColumnarExchange Input [7]: 
[hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (16) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -227,7 +227,7 @@ Arguments: X, X (23) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] -Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X], [plan_id=X], [shuffle_writer_type=hash] (24) ShuffleQueryStage Output [6]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X] @@ -264,7 +264,7 @@ Arguments: X, X (32) ColumnarExchange Input [3]: [hash_partition_key#X, s_suppkey#X, s_nationkey#X] -Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_suppkey#X, 1), ENSURE_REQUIREMENTS, [s_suppkey#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (33) ShuffleQueryStage Output [2]: [s_suppkey#X, s_nationkey#X] @@ -296,7 +296,7 @@ Arguments: X, X (40) ColumnarExchange Input [8]: [hash_partition_key#X, l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] -Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_suppkey#X, l_partkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X], [plan_id=X], [shuffle_writer_type=hash] (41) ShuffleQueryStage Output [7]: [l_orderkey#X, l_partkey#X, l_suppkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X] @@ -333,7 +333,7 @@ Arguments: X, X (49) ColumnarExchange Input [4]: [hash_partition_key#X, ps_partkey#X, ps_suppkey#X, ps_supplycost#X] -Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(ps_suppkey#X, ps_partkey#X, 1), ENSURE_REQUIREMENTS, [ps_partkey#X, ps_suppkey#X, ps_supplycost#X], [plan_id=X], [shuffle_writer_type=hash] (50) ShuffleQueryStage Output [3]: [ps_partkey#X, ps_suppkey#X, ps_supplycost#X] @@ -365,7 +365,7 @@ Arguments: X, X (57) ColumnarExchange Input [7]: [hash_partition_key#X, l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] -Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(l_orderkey#X, 1), ENSURE_REQUIREMENTS, [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X], 
[plan_id=X], [shuffle_writer_type=hash] (58) ShuffleQueryStage Output [6]: [l_orderkey#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X] @@ -402,7 +402,7 @@ Arguments: X, X (66) ColumnarExchange Input [3]: [hash_partition_key#X, o_orderkey#X, o_orderdate#X] -Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(o_orderkey#X, 1), ENSURE_REQUIREMENTS, [o_orderkey#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (67) ShuffleQueryStage Output [2]: [o_orderkey#X, o_orderdate#X] @@ -434,7 +434,7 @@ Arguments: X, X (74) ColumnarExchange Input [7]: [hash_partition_key#X, l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] -Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(s_nationkey#X, 1), ENSURE_REQUIREMENTS, [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X], [plan_id=X], [shuffle_writer_type=hash] (75) ShuffleQueryStage Output [6]: [l_quantity#X, l_extendedprice#X, l_discount#X, s_nationkey#X, ps_supplycost#X, o_orderdate#X] @@ -471,7 +471,7 @@ Arguments: X, X (83) ColumnarExchange Input [3]: [hash_partition_key#X, n_nationkey#X, n_name#X] -Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(n_nationkey#X, 1), ENSURE_REQUIREMENTS, [n_nationkey#X, n_name#X], [plan_id=X], [shuffle_writer_type=hash] (84) ShuffleQueryStage Output [2]: [n_nationkey#X, n_name#X] @@ -514,7 +514,7 @@ Arguments: X, X (93) ColumnarExchange Input [5]: [hash_partition_key#X, nation#X, o_year#X, sum#X, isEmpty#X] -Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [id=#X] +Arguments: hashpartitioning(nation#X, o_year#X, 1), ENSURE_REQUIREMENTS, [nation#X, o_year#X, sum#X, isEmpty#X], [plan_id=X], [shuffle_writer_type=hash] (94) ShuffleQueryStage Output [4]: [nation#X, o_year#X, sum#X, isEmpty#X] @@ -543,7 +543,7 @@ Arguments: X, X (100) ColumnarExchange Input [3]: [nation#X, o_year#X, sum_profit#X] -Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [id=#X] +Arguments: rangepartitioning(nation#X ASC NULLS FIRST, o_year#X DESC NULLS LAST, 1), ENSURE_REQUIREMENTS, [plan_id=X], [shuffle_writer_type=hash] (101) ShuffleQueryStage Output [3]: [nation#X, o_year#X, sum_profit#X] diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/DynamicOffHeapSizingTest.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/DynamicOffHeapSizingSuite.scala similarity index 96% rename from backends-velox/src/test/scala/org/apache/gluten/execution/DynamicOffHeapSizingTest.scala rename to backends-velox/src/test/scala/org/apache/gluten/execution/DynamicOffHeapSizingSuite.scala index 56fc6eac3e11a..51d06110ed60e 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/DynamicOffHeapSizingTest.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/DynamicOffHeapSizingSuite.scala @@ -22,7 +22,7 @@ import org.apache.gluten.tags.SkipTestTags import org.apache.spark.SparkConf @SkipTestTags -class DynamicOffHeapSizingTest extends VeloxWholeStageTransformerSuite { +class 
DynamicOffHeapSizingSuite extends VeloxWholeStageTransformerSuite { override protected val resourcePath: String = "/tpch-data-parquet-velox" override protected val fileFormat: String = "parquet" diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/FunctionsValidateTest.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/FunctionsValidateSuite.scala similarity index 97% rename from backends-velox/src/test/scala/org/apache/gluten/execution/FunctionsValidateTest.scala rename to backends-velox/src/test/scala/org/apache/gluten/execution/FunctionsValidateSuite.scala index 12f66278f70ae..a9c5cf70da645 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/FunctionsValidateTest.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/FunctionsValidateSuite.scala @@ -25,7 +25,7 @@ import java.nio.file.Files import scala.collection.JavaConverters._ -class FunctionsValidateTest extends WholeStageTransformerSuite { +class FunctionsValidateSuite extends WholeStageTransformerSuite { override protected val resourcePath: String = "/tpch-data-parquet-velox" override protected val fileFormat: String = "parquet" private var parquetPath: String = _ diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/TestOperator.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala similarity index 98% rename from backends-velox/src/test/scala/org/apache/gluten/execution/TestOperator.scala rename to backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala index 5ca5087d9ef4a..fa7eae37b1c9a 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/TestOperator.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala @@ -35,7 +35,7 @@ import java.util.concurrent.TimeUnit import scala.collection.JavaConverters -class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPlanHelper { +class MiscOperatorSuite extends VeloxWholeStageTransformerSuite with AdaptiveSparkPlanHelper { protected val rootPath: String = getClass.getResource("/").getPath override protected val resourcePath: String = "/tpch-data-parquet-velox" @@ -900,12 +900,11 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla test("combine small batches before shuffle") { val minBatchSize = 15 - val maxBatchSize = 100 withSQLConf( "spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput" -> "true", "spark.gluten.sql.columnar.maxBatchSize" -> "2", - "spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput.range" -> - s"$minBatchSize~$maxBatchSize" + "spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput.minSize" -> + s"$minBatchSize" ) { val df = runQueryAndCompare( "select l_orderkey, sum(l_partkey) as sum from lineitem " + @@ -921,16 +920,10 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla assert(metrics("numOutputRows").value == 27) assert(metrics("numOutputBatches").value == 2) } - } - test("split small batches before shuffle") { - val minBatchSize = 1 - val maxBatchSize = 4 withSQLConf( "spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput" -> "true", - "spark.gluten.sql.columnar.maxBatchSize" -> "100", - "spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput.range" -> - s"$minBatchSize~$maxBatchSize" + "spark.gluten.sql.columnar.maxBatchSize" -> "2" ) { val df = runQueryAndCompare( "select l_orderkey, sum(l_partkey) as sum from 
lineitem " + @@ -939,12 +932,12 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla val ops = collect(df.queryExecution.executedPlan) { case p: VeloxResizeBatchesExec => p } assert(ops.size == 1) val op = ops.head - assert(op.minOutputBatchSize == minBatchSize) + assert(op.minOutputBatchSize == 1) val metrics = op.metrics assert(metrics("numInputRows").value == 27) - assert(metrics("numInputBatches").value == 1) + assert(metrics("numInputBatches").value == 14) assert(metrics("numOutputRows").value == 27) - assert(metrics("numOutputBatches").value == 7) + assert(metrics("numOutputBatches").value == 14) } } @@ -1624,9 +1617,11 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla path => (0 to 3).toDF("x").write.parquet(path.getCanonicalPath) spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("view") - runQueryAndCompare( - "SELECT x FROM view WHERE cast(x as timestamp) " + - "IN ('1970-01-01 08:00:00.001','1970-01-01 08:00:00.2')")(_) + runQueryAndCompare(s""" + |SELECT x FROM view + |WHERE cast(x as timestamp) + |IN ('1970-01-01 08:00:00.001','1970-01-01 08:00:00.2') + |""".stripMargin)(_ => ()) } } @@ -2083,4 +2078,12 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla checkGlutenOperatorMatch[SortExecTransformer] } } + + // Enable the test after fixing https://github.com/apache/incubator-gluten/issues/6827 + ignore("Test round expression") { + val df1 = runQueryAndCompare("SELECT round(cast(0.5549999999999999 as double), 2)") { _ => } + checkLengthAndPlan(df1, 1) + val df2 = runQueryAndCompare("SELECT round(cast(0.19324999999999998 as double), 2)") { _ => } + checkLengthAndPlan(df2, 1) + } } diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala index 897c1c5f58d5e..b8de30b1b06f7 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala @@ -67,7 +67,7 @@ class ScalarFunctionsValidateSuiteRasOn extends ScalarFunctionsValidateSuite { } } -abstract class ScalarFunctionsValidateSuite extends FunctionsValidateTest { +abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite { disableFallbackCheck import testImplicits._ @@ -263,20 +263,28 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateTest { } } - test("Test get_json_object datatab function") { + test("get_json_object") { runQueryAndCompare( "SELECT get_json_object(string_field1, '$.a') " + "from datatab limit 1;") { checkGlutenOperatorMatch[ProjectExecTransformer] } - } - test("Test get_json_object lineitem function") { runQueryAndCompare( "SELECT l_orderkey, get_json_object('{\"a\":\"b\"}', '$.a') " + "from lineitem limit 1;") { checkGlutenOperatorMatch[ProjectExecTransformer] } + + // Invalid UTF-8 encoding. 
+ spark.sql( + "CREATE TABLE t USING parquet SELECT concat('{\"a\": 2, \"'," + + " string(X'80'), '\": 3, \"c\": 100}') AS c1") + withTable("t") { + runQueryAndCompare("SELECT get_json_object(c1, '$.c') FROM t;") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } } ignore("json_array_length") { diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala index fb1f0542639f7..dc488d44672c9 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxAggregateFunctionsSuite.scala @@ -1180,7 +1180,7 @@ class VeloxAggregateFunctionsDefaultSuite extends VeloxAggregateFunctionsSuite { .set(GlutenConfig.VELOX_FLUSHABLE_PARTIAL_AGGREGATION_ENABLED.key, "false") } - test("group sets with keys") { + test("flushable aggregate rule") { withSQLConf(SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false") { runQueryAndCompare(VeloxAggregateFunctionsSuite.GROUP_SETS_TEST_SQL) { df => @@ -1218,6 +1218,24 @@ class VeloxAggregateFunctionsFlushSuite extends VeloxAggregateFunctionsSuite { } } + test("flushable aggregate rule - agg input already distributed by keys") { + withSQLConf( + SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", + SQLConf.FILES_MAX_PARTITION_BYTES.key -> "1k") { + runQueryAndCompare( + "select * from (select distinct l_orderkey,l_partkey from lineitem) a" + + " inner join (select l_orderkey from lineitem limit 10) b" + + " on a.l_orderkey = b.l_orderkey limit 10") { + df => + val executedPlan = getExecutedPlan(df) + assert( + executedPlan.exists(plan => plan.isInstanceOf[RegularHashAggregateExecTransformer])) + assert( + executedPlan.exists(plan => plan.isInstanceOf[FlushableHashAggregateExecTransformer])) + } + } + } + test("flushable aggregate decimal sum") { withSQLConf( SQLConf.ADAPTIVE_EXECUTION_ENABLED.key -> "false", diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxRoughCostModelSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxRoughCostModelSuite.scala new file mode 100644 index 0000000000000..ca3bbb0b1e723 --- /dev/null +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxRoughCostModelSuite.scala @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.execution + +import org.apache.gluten.GlutenConfig + +import org.apache.spark.SparkConf +import org.apache.spark.sql.execution.ProjectExec + +class VeloxRoughCostModelSuite extends VeloxWholeStageTransformerSuite { + override protected val resourcePath: String = "/tpch-data-parquet-velox" + override protected val fileFormat: String = "parquet" + + override def beforeAll(): Unit = { + super.beforeAll() + spark + .range(100) + .selectExpr("cast(id % 3 as int) as c1", "id as c2") + .write + .format("parquet") + .saveAsTable("tmp1") + } + + override protected def afterAll(): Unit = { + spark.sql("drop table tmp1") + super.afterAll() + } + + override protected def sparkConf: SparkConf = super.sparkConf + .set(GlutenConfig.RAS_ENABLED.key, "true") + .set(GlutenConfig.RAS_COST_MODEL.key, "rough") + + test("fallback trivial project if its neighbor nodes fell back") { + withSQLConf(GlutenConfig.COLUMNAR_FILESCAN_ENABLED.key -> "false") { + runQueryAndCompare("select c1 as c3 from tmp1") { + checkSparkOperatorMatch[ProjectExec] + } + } + } +} diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala index 688cca6995c04..852a1f4fbc66e 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxScanSuite.scala @@ -16,6 +16,10 @@ */ package org.apache.gluten.execution +import org.apache.gluten.GlutenConfig +import org.apache.gluten.backendsapi.velox.VeloxBackendSettings +import org.apache.gluten.utils.VeloxFileSystemValidationJniWrapper + import org.apache.spark.SparkConf import org.apache.spark.sql.catalyst.expressions.GreaterThan import org.apache.spark.sql.execution.ScalarSubquery @@ -74,4 +78,44 @@ class VeloxScanSuite extends VeloxWholeStageTransformerSuite { } } } + + test("Test file scheme validation") { + withTempPath { + path => + withSQLConf(GlutenConfig.NATIVE_WRITER_ENABLED.key -> "false") { + spark + .range(100) + .selectExpr("cast(id % 9 as int) as c1") + .write + .format("parquet") + .save(path.getCanonicalPath) + runQueryAndCompare(s"SELECT count(*) FROM `parquet`.`${path.getCanonicalPath}`") { + df => + val plan = df.queryExecution.executedPlan + val fileScan = collect(plan) { case s: FileSourceScanExecTransformer => s } + assert(fileScan.size == 1) + val rootPaths = fileScan(0).getRootPathsInternal + assert(rootPaths.length == 1) + assert(rootPaths(0).startsWith("file:/")) + assert( + VeloxFileSystemValidationJniWrapper.allSupportedByRegisteredFileSystems( + rootPaths.toArray)) + } + } + } + val filteredRootPath = + VeloxBackendSettings.distinctRootPaths( + Seq("file:/test_path/", "test://test/s", "test://test1/s")) + assert(filteredRootPath.length == 1) + assert(filteredRootPath(0).startsWith("test://")) + assert( + VeloxFileSystemValidationJniWrapper.allSupportedByRegisteredFileSystems( + Array("file:/test_path/"))) + assert( + !VeloxFileSystemValidationJniWrapper.allSupportedByRegisteredFileSystems( + Array("unsupported://test_path"))) + assert( + !VeloxFileSystemValidationJniWrapper.allSupportedByRegisteredFileSystems( + Array("file:/test_path/", "unsupported://test_path"))) + } } diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxStringFunctionsSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxStringFunctionsSuite.scala index 2aedde12a5a98..9357bc754d5d4 100644 --- 
a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxStringFunctionsSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxStringFunctionsSuite.scala @@ -519,23 +519,56 @@ class VeloxStringFunctionsSuite extends VeloxWholeStageTransformerSuite { s"from $LINEITEM_TABLE limit 5") { _ => } } - ignore("split") { + testWithSpecifiedSparkVersion("split", Some("3.4")) { runQueryAndCompare( - s"select l_orderkey, l_comment, split(l_comment, ' ', 3) " + - s"from $LINEITEM_TABLE limit 5") { _ => } + s"select l_orderkey, l_comment, split(l_comment, '') " + + s"from $LINEITEM_TABLE limit 5") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + runQueryAndCompare( + s"select l_orderkey, l_comment, split(l_comment, '', 1) " + + s"from $LINEITEM_TABLE limit 5") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } - // todo incorrect results runQueryAndCompare( - s"select l_orderkey, l_comment, split(l_comment, '[a]', 3) " + - s"from $LINEITEM_TABLE limit 5") { _ => } + s"select l_orderkey, l_comment, split(l_comment, ',') " + + s"from $LINEITEM_TABLE limit 5") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + runQueryAndCompare( + s"select l_orderkey, l_comment, split(l_comment, ',', 10) " + + s"from $LINEITEM_TABLE limit 5")(checkGlutenOperatorMatch[ProjectExecTransformer]) runQueryAndCompare( s"select l_orderkey, split(l_comment, ' ') " + - s"from $LINEITEM_TABLE limit 5") { _ => } + s"from $LINEITEM_TABLE limit 5")(checkGlutenOperatorMatch[ProjectExecTransformer]) + runQueryAndCompare( + s"select l_orderkey, l_comment, split(l_comment, ' ', 3) " + + s"from $LINEITEM_TABLE limit 5")(checkGlutenOperatorMatch[ProjectExecTransformer]) runQueryAndCompare( - s"select l_orderkey, split(l_comment, 'h') " + - s"from $LINEITEM_TABLE limit 5") { _ => } + s"select l_orderkey, l_comment, split(l_comment, '[a-z]+') " + + s"from $LINEITEM_TABLE limit 5")(checkGlutenOperatorMatch[ProjectExecTransformer]) + runQueryAndCompare( + s"select l_orderkey, l_comment, split(l_comment, '[a-z]+', 3) " + + s"from $LINEITEM_TABLE limit 5")(checkGlutenOperatorMatch[ProjectExecTransformer]) + + runQueryAndCompare( + s"select l_orderkey, split(l_comment, '[1-9]+', -2) " + + s"from $LINEITEM_TABLE limit 5")(checkGlutenOperatorMatch[ProjectExecTransformer]) + runQueryAndCompare( + s"select l_orderkey, split(l_comment, '[1-9]+', 0) " + + s"from $LINEITEM_TABLE limit 5")(checkGlutenOperatorMatch[ProjectExecTransformer]) + + runQueryAndCompare( + s"select l_orderkey, l_comment, split(l_comment, 'h') " + + s"from $LINEITEM_TABLE limit 5") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + runQueryAndCompare( + s"select l_orderkey, l_comment, split(l_comment, '[a]', 3) " + + s"from $LINEITEM_TABLE limit 5")(checkGlutenOperatorMatch[ProjectExecTransformer]) } test("substr") { diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxTPCHSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxTPCHSuite.scala index 22f96bbbc4c2a..0e94c242c1db2 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxTPCHSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/VeloxTPCHSuite.scala @@ -255,7 +255,7 @@ class VeloxTPCHDistinctSpillSuite extends VeloxTPCHTableSupport { super.sparkConf .set("spark.memory.offHeap.size", "50m") .set("spark.gluten.memory.overAcquiredMemoryRatio", "0.9") // to trigger distinct spill early - .set(GlutenConfig.GLUTEN_COLUMNAR_TO_ROW_MEM_THRESHOLD_KEY, "8k") + 
.set(GlutenConfig.GLUTEN_COLUMNAR_TO_ROW_MEM_THRESHOLD.key, "8k") } test("distinct spill") { diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/WindowFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/WindowFunctionsValidateSuite.scala index ef9c028be46d2..04d0d2c56b94f 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/WindowFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/WindowFunctionsValidateSuite.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.execution -class WindowFunctionsValidateSuite extends FunctionsValidateTest { +class WindowFunctionsValidateSuite extends FunctionsValidateSuite { test("lag/lead window function with negative input offset") { runQueryAndCompare( diff --git a/backends-velox/src/test/scala/org/apache/gluten/expression/VeloxUdfSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/expression/VeloxUdfSuite.scala index 534a8d9f1c74d..008337b9400ed 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/expression/VeloxUdfSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/expression/VeloxUdfSuite.scala @@ -16,6 +16,7 @@ */ package org.apache.gluten.expression +import org.apache.gluten.backendsapi.velox.VeloxBackendSettings import org.apache.gluten.tags.{SkipTestTags, UDFTest} import org.apache.spark.SparkConf @@ -88,6 +89,23 @@ abstract class VeloxUdfSuite extends GlutenQueryTest with SQLHelper { .sameElements(Array(Row(105L, 6, 6L, 5, 6, 11, 6L, 11L, Date.valueOf("2024-03-30"))))) } + test("test udf allow type conversion") { + withSQLConf(VeloxBackendSettings.GLUTEN_VELOX_UDF_ALLOW_TYPE_CONVERSION -> "true") { + val df = spark.sql("""select myudf1("100"), myudf1(1), mydate('2024-03-25', 5)""") + assert( + df.collect() + .sameElements(Array(Row(105L, 6L, Date.valueOf("2024-03-30"))))) + } + + withSQLConf(VeloxBackendSettings.GLUTEN_VELOX_UDF_ALLOW_TYPE_CONVERSION -> "false") { + assert( + spark + .sql("select mydate2('2024-03-25', 5)") + .collect() + .sameElements(Array(Row(Date.valueOf("2024-03-30"))))) + } + } + test("test udaf") { val df = spark.sql("""select | myavg(1), @@ -101,6 +119,15 @@ abstract class VeloxUdfSuite extends GlutenQueryTest with SQLHelper { df.collect() .sameElements(Array(Row(1.0, 1.0, 1.0, 1.0, 1L)))) } + + test("test udaf allow type conversion") { + withSQLConf(VeloxBackendSettings.GLUTEN_VELOX_UDF_ALLOW_TYPE_CONVERSION -> "true") { + val df = spark.sql("""select myavg("1"), myavg("1.0"), mycount_if("true")""") + assert( + df.collect() + .sameElements(Array(Row(1.0, 1.0, 1L)))) + } + } } @UDFTest diff --git a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestBase.scala b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerBase.scala similarity index 96% rename from backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestBase.scala rename to backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerBase.scala index 1ee79a2ade873..07996bc4b2666 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestBase.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerBase.scala @@ -18,12 +18,12 @@ package org.apache.gluten.fuzzer import org.apache.gluten.benchmarks.RandomParquetDataGenerator import org.apache.gluten.execution.VeloxWholeStageTransformerSuite -import org.apache.gluten.fuzzer.FuzzerTestResult.{Failed, OOM, Successful, TestResult} +import org.apache.gluten.fuzzer.FuzzerResult.{Failed, OOM, Successful, TestResult} 
import org.apache.gluten.memory.memtarget.ThrowOnOomMemoryTarget import org.apache.spark.SparkConf -abstract class FuzzerTestBase extends VeloxWholeStageTransformerSuite { +abstract class FuzzerBase extends VeloxWholeStageTransformerSuite { override protected val resourcePath: String = "/tpch-data-parquet-velox" override protected val fileFormat: String = "parquet" diff --git a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestResult.scala b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerResult.scala similarity index 97% rename from backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestResult.scala rename to backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerResult.scala index 42f901b068b4a..cff9ffec8fd54 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerTestResult.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/FuzzerResult.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.fuzzer -object FuzzerTestResult { +object FuzzerResult { trait TestResult { val seed: Long diff --git a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/RowToColumnarFuzzerTest.scala b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/RowToColumnarFuzzer.scala similarity index 95% rename from backends-velox/src/test/scala/org/apache/gluten/fuzzer/RowToColumnarFuzzerTest.scala rename to backends-velox/src/test/scala/org/apache/gluten/fuzzer/RowToColumnarFuzzer.scala index b6d986047e2c4..d60e577aca09c 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/RowToColumnarFuzzerTest.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/RowToColumnarFuzzer.scala @@ -17,7 +17,7 @@ package org.apache.gluten.fuzzer import org.apache.gluten.execution.RowToVeloxColumnarExec -import org.apache.gluten.fuzzer.FuzzerTestResult.Successful +import org.apache.gluten.fuzzer.FuzzerResult.Successful import org.apache.gluten.tags.{FuzzerTest, SkipTestTags} import org.apache.spark.SparkConf @@ -25,7 +25,7 @@ import org.apache.spark.sql.DataFrame @FuzzerTest @SkipTestTags -class RowToColumnarFuzzerTest extends FuzzerTestBase { +class RowToColumnarFuzzer extends FuzzerBase { override protected def sparkConf: SparkConf = { super.sparkConf diff --git a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzerTest.scala b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzer.scala similarity index 96% rename from backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzerTest.scala rename to backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzer.scala index 7d8fc56d9728a..44f25b517b467 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzerTest.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/fuzzer/ShuffleWriterFuzzer.scala @@ -16,7 +16,7 @@ */ package org.apache.gluten.fuzzer -import org.apache.gluten.fuzzer.FuzzerTestResult.Successful +import org.apache.gluten.fuzzer.FuzzerResult.Successful import org.apache.gluten.tags.{FuzzerTest, SkipTestTags} import org.apache.spark.sql.DataFrame @@ -24,7 +24,7 @@ import org.apache.spark.sql.execution.ColumnarShuffleExchangeExec @FuzzerTest @SkipTestTags -class ShuffleWriterFuzzerTest extends FuzzerTestBase { +class ShuffleWriterFuzzer extends FuzzerBase { private val REPARTITION_SQL = (numPartitions: Int) => s"select /*+ REPARTITION($numPartitions) */ * from tbl" private val AGG_REPARTITION_SQL = diff --git 
a/backends-velox/src/test/scala/org/apache/gluten/planner/VeloxRasSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/planner/VeloxRasSuite.scala index 1806eacfc6779..646e94c696cca 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/planner/VeloxRasSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/planner/VeloxRasSuite.scala @@ -195,7 +195,7 @@ object VeloxRasSuite { } class UserCostModel1 extends CostModel[SparkPlan] { - private val base = GlutenCostModel.rough() + private val base = GlutenCostModel.legacy() override def costOf(node: SparkPlan): Cost = node match { case _: RowUnary => base.makeInfCost() case other => base.costOf(other) @@ -205,7 +205,7 @@ object VeloxRasSuite { } class UserCostModel2 extends CostModel[SparkPlan] { - private val base = GlutenCostModel.rough() + private val base = GlutenCostModel.legacy() override def costOf(node: SparkPlan): Cost = node match { case _: ColumnarUnary => base.makeInfCost() case other => base.costOf(other) diff --git a/cpp-ch/clickhouse.version b/cpp-ch/clickhouse.version index 46b23e5f0e619..b88675c4a96d6 100644 --- a/cpp-ch/clickhouse.version +++ b/cpp-ch/clickhouse.version @@ -1,4 +1,3 @@ CH_ORG=Kyligence -CH_BRANCH=rebase_ch/20240806 -CH_COMMIT=c8a7d6e496d - +CH_BRANCH=rebase_ch/20240820 +CH_COMMIT=b5b8245b022 diff --git a/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionGroupBloomFilter.cpp b/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionGroupBloomFilter.cpp index 5555302a5c2fd..1b853cc67c691 100644 --- a/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionGroupBloomFilter.cpp +++ b/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionGroupBloomFilter.cpp @@ -62,10 +62,10 @@ createAggregateFunctionBloomFilter(const std::string & name, const DataTypes & a if (type != Field::Types::Int64 && type != Field::Types::UInt64) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter for aggregate function {} should be Int64 or UInt64", name); - if ((type == Field::Types::Int64 && parameters[i].get() < 0)) + if ((type == Field::Types::Int64 && parameters[i].safeGet() < 0)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter for aggregate function {} should be non-negative number", name); - return parameters[i].get(); + return parameters[i].safeGet(); }; filter_size = get_parameter(0); diff --git a/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionSparkAvg.cpp b/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionSparkAvg.cpp index 5eb3a0b360575..0aa2331457284 100644 --- a/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionSparkAvg.cpp +++ b/cpp-ch/local-engine/AggregateFunctions/AggregateFunctionSparkAvg.cpp @@ -140,7 +140,7 @@ createAggregateFunctionSparkAvg(const std::string & name, const DataTypes & argu throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}", data_type->getName(), name); - bool allowPrecisionLoss = settings->get(DECIMAL_OPERATIONS_ALLOW_PREC_LOSS).get(); + bool allowPrecisionLoss = settings->get(DECIMAL_OPERATIONS_ALLOW_PREC_LOSS).safeGet(); const UInt32 p1 = DB::getDecimalPrecision(*data_type); const UInt32 s1 = DB::getDecimalScale(*data_type); auto [p2, s2] = GlutenDecimalUtils::LONG_DECIMAL; diff --git a/cpp-ch/local-engine/Common/CHUtil.cpp b/cpp-ch/local-engine/Common/CHUtil.cpp index d32eed92340a4..9558bf957d4a5 100644 --- a/cpp-ch/local-engine/Common/CHUtil.cpp +++ b/cpp-ch/local-engine/Common/CHUtil.cpp @@ -51,11 +51,13 @@ #include #include #include +#include #include #include #include 
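A recurring change in this patch is replacing Field::get with the checked Field::safeGet when reading aggregate-function parameters and settings, as in createAggregateFunctionBloomFilter above. The sketch below shows the pattern being relied on, with std::variant standing in for DB::Field: a checked accessor that throws on a type mismatch, plus the Int64/UInt64 and non-negative validation done for the bloom filter parameters. All names here are illustrative.

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <variant>

// Stand-in for DB::Field, holding only the alternatives needed for this sketch.
using Param = std::variant<int64_t, uint64_t, std::string>;

// Checked accessor in the spirit of Field::safeGet<T>(): throw instead of
// reinterpreting the storage when the requested type does not match.
template <typename T>
T safeGet(const Param & p)
{
    if (const auto * v = std::get_if<T>(&p))
        return *v;
    throw std::runtime_error("Bad Param access: stored type does not match requested type");
}

// Mirrors the validation in createAggregateFunctionBloomFilter: the parameter
// must be Int64 or UInt64 and must not be negative.
uint64_t getSizeParameter(const Param & p)
{
    if (std::holds_alternative<std::string>(p))
        throw std::invalid_argument("Parameter should be Int64 or UInt64");
    if (std::holds_alternative<int64_t>(p) && safeGet<int64_t>(p) < 0)
        throw std::invalid_argument("Parameter should be a non-negative number");
    return std::holds_alternative<int64_t>(p) ? static_cast<uint64_t>(safeGet<int64_t>(p)) : safeGet<uint64_t>(p);
}

int main()
{
    std::cout << getSizeParameter(Param{int64_t{1024}}) << '\n'; // 1024
    try { getSizeParameter(Param{int64_t{-1}}); }
    catch (const std::exception & e) { std::cout << e.what() << '\n'; }
}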
#include #include +#include #include #include #include @@ -72,7 +74,6 @@ #include #include #include -#include namespace DB { @@ -463,20 +464,22 @@ const DB::ColumnWithTypeAndName * NestedColumnExtractHelper::findColumn(const DB const DB::ActionsDAG::Node * ActionsDAGUtil::convertNodeType( DB::ActionsDAG & actions_dag, const DB::ActionsDAG::Node * node, - const std::string & type_name, + const DataTypePtr & cast_to_type, const std::string & result_name, CastType cast_type) { DB::ColumnWithTypeAndName type_name_col; - type_name_col.name = type_name; + type_name_col.name = cast_to_type->getName(); type_name_col.column = DB::DataTypeString().createColumnConst(0, type_name_col.name); type_name_col.type = std::make_shared(); const auto * right_arg = &actions_dag.addColumn(std::move(type_name_col)); const auto * left_arg = node; DB::CastDiagnostic diagnostic = {node->result_name, node->result_name}; + ColumnWithTypeAndName left_column{nullptr, node->result_type, {}}; DB::ActionsDAG::NodeRawConstPtrs children = {left_arg, right_arg}; - return &actions_dag.addFunction( - DB::createInternalCastOverloadResolver(cast_type, std::move(diagnostic)), std::move(children), result_name); + auto func_base_cast = createInternalCast(std::move(left_column), cast_to_type, cast_type, diagnostic); + + return &actions_dag.addFunction(func_base_cast, std::move(children), result_name); } const DB::ActionsDAG::Node * ActionsDAGUtil::convertNodeTypeIfNeeded( @@ -489,7 +492,7 @@ const DB::ActionsDAG::Node * ActionsDAGUtil::convertNodeTypeIfNeeded( if (node->result_type->equals(*dst_type)) return node; - return convertNodeType(actions_dag, node, dst_type->getName(), result_name, cast_type); + return convertNodeType(actions_dag, node, dst_type, result_name, cast_type); } String QueryPipelineUtil::explainPipeline(DB::QueryPipeline & pipeline) @@ -570,6 +573,18 @@ std::vector BackendInitializerUtil::wrapDiskPathConfig( std::vector changed_paths; if (path_prefix.empty() && path_suffix.empty()) return changed_paths; + + auto change_func = [&](String key) -> void + { + if (const String value = config.getString(key, ""); value != "") + { + const String change_value = path_prefix + value + path_suffix; + config.setString(key, change_value); + changed_paths.emplace_back(change_value); + LOG_INFO(getLogger("BackendInitializerUtil"), "Change config `{}` from '{}' to {}.", key, value, change_value); + } + }; + Poco::Util::AbstractConfiguration::Keys disks; std::unordered_set disk_types = {"s3_gluten", "hdfs_gluten", "cache"}; config.keys("storage_configuration.disks", disks); @@ -583,26 +598,14 @@ std::vector BackendInitializerUtil::wrapDiskPathConfig( if (!disk_types.contains(disk_type)) return; if (disk_type == "cache") - { - String path = config.getString(disk_prefix + ".path", ""); - if (!path.empty()) - { - String final_path = path_prefix + path + path_suffix; - config.setString(disk_prefix + ".path", final_path); - changed_paths.emplace_back(final_path); - } - } + change_func(disk_prefix + ".path"); else if (disk_type == "s3_gluten" || disk_type == "hdfs_gluten") - { - String metadata_path = config.getString(disk_prefix + ".metadata_path", ""); - if (!metadata_path.empty()) - { - String final_path = path_prefix + metadata_path + path_suffix; - config.setString(disk_prefix + ".metadata_path", final_path); - changed_paths.emplace_back(final_path); - } - } + change_func(disk_prefix + ".metadata_path"); }); + + change_func("path"); + change_func("gluten_cache.local.path"); + return changed_paths; } @@ -781,7 +784,6 @@ void 
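The wrapDiskPathConfig refactor funnels every path rewrite through a single change_func lambda and also applies it to the top-level "path" and "gluten_cache.local.path" keys. A minimal sketch of that pattern follows, with a std::map standing in for Poco::Util::AbstractConfiguration; the keys and prefix/suffix values are illustrative.

#include <iostream>
#include <map>
#include <string>
#include <vector>

int main()
{
    // Stand-in for the Poco configuration: key -> string value.
    std::map<std::string, std::string> config = {
        {"path", "/data/clickhouse"},
        {"gluten_cache.local.path", "/data/gluten_cache"},
        {"storage_configuration.disks.hdfs.metadata_path", "/meta/hdfs"},
    };

    const std::string path_prefix = "/mnt/nvme0";
    const std::string path_suffix = "/worker_1";
    std::vector<std::string> changed_paths;

    // Same shape as the change_func lambda: rewrite the value in place and
    // remember which paths were touched so they can be cleaned up later.
    auto change_func = [&](const std::string & key)
    {
        auto it = config.find(key);
        if (it == config.end() || it->second.empty())
            return;
        const std::string changed = path_prefix + it->second + path_suffix;
        it->second = changed;
        changed_paths.emplace_back(changed);
    };

    change_func("path");
    change_func("gluten_cache.local.path");
    change_func("storage_configuration.disks.hdfs.metadata_path");

    for (const auto & p : changed_paths)
        std::cout << p << '\n';
}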
BackendInitializerUtil::initSettings(std::map & b settings.set("input_format_parquet_import_nested", true); settings.set("input_format_json_read_numbers_as_strings", true); settings.set("input_format_json_read_bools_as_numbers", false); - settings.set("input_format_json_case_insensitive_column_matching", true); settings.set("input_format_csv_trim_whitespaces", false); settings.set("input_format_csv_allow_cr_end_of_line", true); settings.set("output_format_orc_string_as_string", true); @@ -977,6 +979,7 @@ void BackendInitializerUtil::init(const std::string_view plan) // Init the table metadata cache map StorageMergeTreeFactory::init_cache_map(); + JobScheduler::initialize(SerializedPlanParser::global_context); CacheManager::initialize(SerializedPlanParser::global_context); std::call_once( diff --git a/cpp-ch/local-engine/Common/CHUtil.h b/cpp-ch/local-engine/Common/CHUtil.h index 785d5d6c00565..a92155d14ea18 100644 --- a/cpp-ch/local-engine/Common/CHUtil.h +++ b/cpp-ch/local-engine/Common/CHUtil.h @@ -128,8 +128,8 @@ class ActionsDAGUtil public: static const DB::ActionsDAG::Node * convertNodeType( DB::ActionsDAG & actions_dag, - const DB::ActionsDAG::Node * node, - const std::string & type_name, + const DB::ActionsDAG::Node * node_to_cast, + const DB::DataTypePtr & cast_to_type, const std::string & result_name = "", DB::CastType cast_type = DB::CastType::nonAccurate); @@ -195,6 +195,8 @@ class BackendInitializerUtil inline static const String GLUTEN_TASK_OFFHEAP = "spark.gluten.memory.task.offHeap.size.in.bytes"; + inline static const String GLUTEN_LOCAL_CACHE_PREFIX = "gluten_cache.local."; + /// On yarn mode, native writing on hdfs cluster takes yarn container user as the user passed to libhdfs3, which /// will cause permission issue because yarn container user is not the owner of the hdfs dir to be written. /// So we need to get the spark user from env and pass it to libhdfs3. diff --git a/cpp-ch/local-engine/Common/ConcurrentMap.h b/cpp-ch/local-engine/Common/ConcurrentMap.h index 1719d9b255eaa..2db35102215ae 100644 --- a/cpp-ch/local-engine/Common/ConcurrentMap.h +++ b/cpp-ch/local-engine/Common/ConcurrentMap.h @@ -16,7 +16,7 @@ */ #pragma once -#include +#include #include namespace local_engine diff --git a/cpp-ch/local-engine/Common/GlutenConfig.h b/cpp-ch/local-engine/Common/GlutenConfig.h index 782df7f5413d4..38c4ce1621381 100644 --- a/cpp-ch/local-engine/Common/GlutenConfig.h +++ b/cpp-ch/local-engine/Common/GlutenConfig.h @@ -17,9 +17,10 @@ #pragma once -#include -#include #include +#include +#include +#include namespace local_engine { @@ -38,7 +39,7 @@ struct MemoryConfig MemoryConfig config; config.extra_memory_hard_limit = context->getConfigRef().getUInt64(EXTRA_MEMORY_HARD_LIMIT, 0); config.off_heap_per_task = context->getConfigRef().getUInt64(CH_TASK_MEMORY, 0); - config.spill_mem_ratio = context->getConfigRef().getUInt64(SPILL_MEM_RATIO, 0.9); + config.spill_mem_ratio = context->getConfigRef().getDouble(SPILL_MEM_RATIO, 0.9); return config; } }; @@ -92,6 +93,27 @@ struct StreamingAggregateConfig } }; +struct JoinConfig +{ + /// If the join condition is like `t1.k = t2.k and (t1.id1 = t2.id2 or t1.id2 = t2.id2)`, try to join with multi + /// join on clauses `(t1.k = t2.k and t1.id1 = t2.id2) or (t1.k = t2.k or t1.id2 = t2.id2)` + inline static const String PREFER_MULTI_JOIN_ON_CLAUSES = "prefer_multi_join_on_clauses"; + /// Only hash join supports multi join on clauses, the right table cannot be too large. 
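The MemoryConfig fix above, switching spill_mem_ratio from getUInt64 to getDouble, matters because reading a fractional ratio through an integer getter silently truncates it to zero. A tiny illustration of the failure mode, kept independent of the Poco API (the two getters below are simplified stand-ins):

#include <cstdint>
#include <iostream>
#include <string>

// Simplified getters in the spirit of a string-backed configuration:
// the typed getter decides how "0.9" is interpreted.
uint64_t getUInt64(const std::string & raw) { return static_cast<uint64_t>(std::stod(raw)); }
double getDouble(const std::string & raw) { return std::stod(raw); }

int main()
{
    const std::string spill_mem_ratio = "0.9";
    std::cout << getUInt64(spill_mem_ratio) << '\n'; // 0   -> the configured ratio is lost
    std::cout << getDouble(spill_mem_ratio) << '\n'; // 0.9 -> the intended threshold
}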
If the row number of right + /// table is larger then this limit, this transform will not work. + inline static const String MULTI_JOIN_ON_CLAUSES_BUILD_SIDE_ROWS_LIMIT = "multi_join_on_clauses_build_side_row_limit"; + + bool prefer_multi_join_on_clauses = true; + size_t multi_join_on_clauses_build_side_rows_limit = 10000000; + + static JoinConfig loadFromContext(const DB::ContextPtr & context) + { + JoinConfig config; + config.prefer_multi_join_on_clauses = context->getConfigRef().getBool(PREFER_MULTI_JOIN_ON_CLAUSES, true); + config.multi_join_on_clauses_build_side_rows_limit = context->getConfigRef().getUInt64(MULTI_JOIN_ON_CLAUSES_BUILD_SIDE_ROWS_LIMIT, 10000000); + return config; + } +}; + struct ExecutorConfig { inline static const String DUMP_PIPELINE = "dump_pipeline"; @@ -113,13 +135,17 @@ struct HdfsConfig { inline static const String HDFS_ASYNC = "hdfs.enable_async_io"; - bool hdfs_async = true; + bool hdfs_async; - static HdfsConfig loadFromContext(DB::ContextPtr context) + static HdfsConfig loadFromContext(const Poco::Util::AbstractConfiguration & config, const DB::ReadSettings & read_settings) { - HdfsConfig config; - config.hdfs_async = context->getConfigRef().getBool(HDFS_ASYNC, true); - return config; + HdfsConfig hdfs; + if (read_settings.enable_filesystem_cache) + hdfs.hdfs_async = false; + else + hdfs.hdfs_async = config.getBool(HDFS_ASYNC, true); + + return hdfs; } }; @@ -138,10 +164,17 @@ struct S3Config static S3Config loadFromContext(DB::ContextPtr context) { S3Config config; - config.s3_local_cache_enabled = context->getConfigRef().getBool(S3_LOCAL_CACHE_ENABLE, false); - config.s3_local_cache_max_size = context->getConfigRef().getUInt64(S3_LOCAL_CACHE_MAX_SIZE, 100_GiB); - config.s3_local_cache_cache_path = context->getConfigRef().getString(S3_LOCAL_CACHE_CACHE_PATH, ""); - config.s3_gcs_issue_compose_request = context->getConfigRef().getBool(S3_GCS_ISSUE_COMPOSE_REQUEST, false); + + if (context->getConfigRef().has("S3_LOCAL_CACHE_ENABLE")) + { + LOG_WARNING(&Poco::Logger::get("S3Config"), "Config {} has deprecated.", S3_LOCAL_CACHE_ENABLE); + + config.s3_local_cache_enabled = context->getConfigRef().getBool(S3_LOCAL_CACHE_ENABLE, false); + config.s3_local_cache_max_size = context->getConfigRef().getUInt64(S3_LOCAL_CACHE_MAX_SIZE, 100_GiB); + config.s3_local_cache_cache_path = context->getConfigRef().getString(S3_LOCAL_CACHE_CACHE_PATH, ""); + config.s3_gcs_issue_compose_request = context->getConfigRef().getBool(S3_GCS_ISSUE_COMPOSE_REQUEST, false); + } + return config; } }; @@ -151,16 +184,29 @@ struct MergeTreeConfig inline static const String TABLE_PART_METADATA_CACHE_MAX_COUNT = "table_part_metadata_cache_max_count"; inline static const String TABLE_METADATA_CACHE_MAX_COUNT = "table_metadata_cache_max_count"; - size_t table_part_metadata_cache_max_count = 1000; - size_t table_metadata_cache_max_count = 100; + size_t table_part_metadata_cache_max_count = 5000; + size_t table_metadata_cache_max_count = 500; static MergeTreeConfig loadFromContext(DB::ContextPtr context) { MergeTreeConfig config; - config.table_part_metadata_cache_max_count = context->getConfigRef().getUInt64(TABLE_PART_METADATA_CACHE_MAX_COUNT, 1000); - config.table_metadata_cache_max_count = context->getConfigRef().getUInt64(TABLE_METADATA_CACHE_MAX_COUNT, 100); + config.table_part_metadata_cache_max_count = context->getConfigRef().getUInt64(TABLE_PART_METADATA_CACHE_MAX_COUNT, 5000); + config.table_metadata_cache_max_count = context->getConfigRef().getUInt64(TABLE_METADATA_CACHE_MAX_COUNT, 
500); return config; } }; -} +struct GlutenJobSchedulerConfig +{ + inline static const String JOB_SCHEDULER_MAX_THREADS = "job_scheduler_max_threads"; + + size_t job_scheduler_max_threads = 10; + + static GlutenJobSchedulerConfig loadFromContext(DB::ContextPtr context) + { + GlutenJobSchedulerConfig config; + config.job_scheduler_max_threads = context->getConfigRef().getUInt64(JOB_SCHEDULER_MAX_THREADS, 10); + return config; + } +}; +} diff --git a/cpp-ch/local-engine/Common/GlutenStringUtils.cpp b/cpp-ch/local-engine/Common/GlutenStringUtils.cpp index b6d11ac1b267e..4a18f4ceda02e 100644 --- a/cpp-ch/local-engine/Common/GlutenStringUtils.cpp +++ b/cpp-ch/local-engine/Common/GlutenStringUtils.cpp @@ -14,10 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include "GlutenStringUtils.h" -#include #include #include +#include + +#include "GlutenStringUtils.h" namespace local_engine { @@ -27,16 +28,46 @@ PartitionValues GlutenStringUtils::parsePartitionTablePath(const std::string & f Poco::StringTokenizer path(file, "/"); for (const auto & item : path) { - auto position = item.find('='); - if (position != std::string::npos) + auto pos = item.find('='); + if (pos != std::string::npos) { - result.emplace_back(PartitionValue(boost::algorithm::to_lower_copy(item.substr(0, position)), item.substr(position + 1))); + auto key = boost::to_lower_copy(item.substr(0, pos)); + auto value = item.substr(pos + 1); + + std::string unescaped_key; + std::string unescaped_value; + Poco::URI::decode(key, unescaped_key); + Poco::URI::decode(value, unescaped_value); + result.emplace_back(std::move(unescaped_key), std::move(unescaped_value)); } } return result; } + bool GlutenStringUtils::isNullPartitionValue(const std::string & value) { return value == "__HIVE_DEFAULT_PARTITION__"; } + +std::string GlutenStringUtils::dumpPartitionValue(const PartitionValue & value) +{ + return value.first + "=" + value.second; +} + +std::string GlutenStringUtils::dumpPartitionValues(const PartitionValues & values) +{ + std::string res; + res += "["; + + for (size_t i = 0; i < values.size(); ++i) + { + if (i) + res += ", "; + res += dumpPartitionValue(values[i]); + } + + res += "]"; + return res; +} + } diff --git a/cpp-ch/local-engine/Common/GlutenStringUtils.h b/cpp-ch/local-engine/Common/GlutenStringUtils.h index 023cb2b8d047f..dd044135320fb 100644 --- a/cpp-ch/local-engine/Common/GlutenStringUtils.h +++ b/cpp-ch/local-engine/Common/GlutenStringUtils.h @@ -28,5 +28,8 @@ class GlutenStringUtils public: static PartitionValues parsePartitionTablePath(const std::string & file); static bool isNullPartitionValue(const std::string & value); + + static std::string dumpPartitionValue(const PartitionValue & value); + static std::string dumpPartitionValues(const PartitionValues & values); }; } diff --git a/cpp-ch/local-engine/Common/QueryContext.cpp b/cpp-ch/local-engine/Common/QueryContext.cpp index 2d5780a6e35b2..ff9c151159a6d 100644 --- a/cpp-ch/local-engine/Common/QueryContext.cpp +++ b/cpp-ch/local-engine/Common/QueryContext.cpp @@ -16,8 +16,6 @@ */ #include "QueryContext.h" -#include - #include #include #include @@ -79,14 +77,19 @@ int64_t QueryContextManager::initializeQuery() DB::ContextMutablePtr QueryContextManager::currentQueryContext() { - if (!CurrentThread::getGroup()) - { - throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Thread group not found."); - } + auto thread_group = currentThreadGroup(); int64_t id = reinterpret_cast(CurrentThread::getGroup().get()); 
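parsePartitionTablePath now URL-decodes both the key and the value of each k=v path segment, and the new dumpPartitionValues helper renders the pairs for logging. Below is a self-contained sketch of the same flow; the percent-decoder stands in for Poco::URI::decode and the sample path is made up.

#include <cctype>
#include <iostream>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

using PartitionValue = std::pair<std::string, std::string>;
using PartitionValues = std::vector<PartitionValue>;

// Minimal percent-decoding, standing in for Poco::URI::decode.
std::string urlDecode(const std::string & in)
{
    std::string out;
    for (size_t i = 0; i < in.size(); ++i)
    {
        if (in[i] == '%' && i + 2 < in.size()
            && std::isxdigit(static_cast<unsigned char>(in[i + 1]))
            && std::isxdigit(static_cast<unsigned char>(in[i + 2])))
        {
            out += static_cast<char>(std::stoi(in.substr(i + 1, 2), nullptr, 16));
            i += 2;
        }
        else
            out += in[i];
    }
    return out;
}

std::string toLower(std::string s)
{
    for (char & c : s) c = static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
    return s;
}

// Split "k1=v1/k2=v2/file.parquet" into decoded (key, value) pairs,
// lower-casing the key as parsePartitionTablePath does.
PartitionValues parsePartitionTablePath(const std::string & file)
{
    PartitionValues result;
    std::stringstream ss(file);
    std::string item;
    while (std::getline(ss, item, '/'))
    {
        auto pos = item.find('=');
        if (pos == std::string::npos)
            continue;
        result.emplace_back(urlDecode(toLower(item.substr(0, pos))), urlDecode(item.substr(pos + 1)));
    }
    return result;
}

// Same output shape as dumpPartitionValues: "[k1=v1, k2=v2]".
std::string dumpPartitionValues(const PartitionValues & values)
{
    std::string res = "[";
    for (size_t i = 0; i < values.size(); ++i)
    {
        if (i) res += ", ";
        res += values[i].first + "=" + values[i].second;
    }
    return res + "]";
}

int main()
{
    std::cout << dumpPartitionValues(parsePartitionTablePath("year=2024/city=New%20York/part-0.parquet")) << '\n';
    // [year=2024, city=New York]
}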
return query_map.get(id)->query_context; } +std::shared_ptr QueryContextManager::currentThreadGroup() +{ + if (auto thread_group = CurrentThread::getGroup()) + return thread_group; + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Thread group not found."); +} + void QueryContextManager::logCurrentPerformanceCounters(ProfileEvents::Counters & counters) { if (!CurrentThread::getGroup()) diff --git a/cpp-ch/local-engine/Common/QueryContext.h b/cpp-ch/local-engine/Common/QueryContext.h index 0fbf4977321f1..4770327d1715c 100644 --- a/cpp-ch/local-engine/Common/QueryContext.h +++ b/cpp-ch/local-engine/Common/QueryContext.h @@ -30,6 +30,7 @@ class QueryContextManager } int64_t initializeQuery(); DB::ContextMutablePtr currentQueryContext(); + static std::shared_ptr currentThreadGroup(); void logCurrentPerformanceCounters(ProfileEvents::Counters& counters); size_t currentPeakMemory(int64_t id); void finalizeQuery(int64_t id); diff --git a/cpp-ch/local-engine/Functions/SparkFunctionArraySort.cpp b/cpp-ch/local-engine/Functions/SparkFunctionArraySort.cpp index 1371ec60e1796..cf9d67f1696b5 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionArraySort.cpp +++ b/cpp-ch/local-engine/Functions/SparkFunctionArraySort.cpp @@ -60,7 +60,7 @@ struct LambdaLess auto compare_res_col = lambda_->reduce(); DB::Field field; compare_res_col.column->get(0, field); - return field.get() < 0; + return field.safeGet() < 0; } private: ALWAYS_INLINE DB::ColumnPtr oneRowColumn(size_t i) const diff --git a/cpp-ch/local-engine/Functions/SparkFunctionCheckDecimalOverflow.h b/cpp-ch/local-engine/Functions/SparkFunctionCheckDecimalOverflow.h index 32bf79a563a70..e501c7fc5ffbd 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionCheckDecimalOverflow.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionCheckDecimalOverflow.h @@ -50,17 +50,17 @@ template Field convertNumericType(const Field & from) { if (from.getType() == Field::Types::UInt64) - return convertNumericTypeImpl(from.get()); + return convertNumericTypeImpl(from.safeGet()); if (from.getType() == Field::Types::Int64) - return convertNumericTypeImpl(from.get()); + return convertNumericTypeImpl(from.safeGet()); if (from.getType() == Field::Types::UInt128) - return convertNumericTypeImpl(from.get()); + return convertNumericTypeImpl(from.safeGet()); if (from.getType() == Field::Types::Int128) - return convertNumericTypeImpl(from.get()); + return convertNumericTypeImpl(from.safeGet()); if (from.getType() == Field::Types::UInt256) - return convertNumericTypeImpl(from.get()); + return convertNumericTypeImpl(from.safeGet()); if (from.getType() == Field::Types::Int256) - return convertNumericTypeImpl(from.get()); + return convertNumericTypeImpl(from.safeGet()); throw Exception(ErrorCodes::TYPE_MISMATCH, "Type mismatch. Expected: Integer. 
Got: {}", from.getType()); } @@ -81,7 +81,7 @@ inline UInt32 extractArgument(const ColumnWithTypeAndName & named_column) throw Exception( ErrorCodes::DECIMAL_OVERFLOW, "{} convert overflow, precision/scale value must in UInt32", named_column.type->getName()); } - return static_cast(to.get()); + return static_cast(to.safeGet()); } } diff --git a/cpp-ch/local-engine/Functions/SparkFunctionFloor.h b/cpp-ch/local-engine/Functions/SparkFunctionFloor.h index ce33d11dbd8cf..4a3f99a9a3567 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionFloor.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionFloor.h @@ -197,7 +197,7 @@ class SparkFunctionFloor : public DB::FunctionFloor if (scale_field.getType() != Field::Types::UInt64 && scale_field.getType() != Field::Types::Int64) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Scale argument for rounding functions must have integer type"); - Int64 scale64 = scale_field.get(); + Int64 scale64 = scale_field.safeGet(); if (scale64 > std::numeric_limits::max() || scale64 < std::numeric_limits::min()) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale argument for rounding function is too large"); diff --git a/cpp-ch/local-engine/Functions/SparkFunctionHashingExtended.h b/cpp-ch/local-engine/Functions/SparkFunctionHashingExtended.h index 57bf00ba99044..c6499031492ed 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionHashingExtended.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionHashingExtended.h @@ -101,42 +101,42 @@ class SparkFunctionAnyHash : public IFunction if (which.isNothing()) return seed; else if (which.isUInt8()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isUInt16()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isUInt32()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isUInt64()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isInt8()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isInt16()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isInt32()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isInt64()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isFloat32()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isFloat64()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isDate()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isDate32()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isDateTime()) - return applyNumber(field.get(), seed); + return applyNumber(field.safeGet(), seed); else if (which.isDateTime64()) - return applyDecimal(field.get(), seed); + return applyDecimal(field.safeGet(), seed); else if (which.isDecimal32()) - return applyDecimal(field.get(), seed); + return applyDecimal(field.safeGet(), seed); else if (which.isDecimal64()) - return applyDecimal(field.get(), seed); + return applyDecimal(field.safeGet(), seed); else if (which.isDecimal128()) - return applyDecimal(field.get(), seed); + return applyDecimal(field.safeGet(), seed); else if (which.isStringOrFixedString()) { - 
const String & str = field.get(); + const String & str = field.safeGet(); return applyUnsafeBytes(str.data(), str.size(), seed); } else if (which.isTuple()) @@ -145,7 +145,7 @@ class SparkFunctionAnyHash : public IFunction assert(tuple_type); const auto & elements = tuple_type->getElements(); - const Tuple & tuple = field.get(); + const Tuple & tuple = field.safeGet(); assert(tuple.size() == elements.size()); for (size_t i = 0; i < elements.size(); ++i) @@ -160,7 +160,7 @@ class SparkFunctionAnyHash : public IFunction assert(array_type); const auto & nested_type = array_type->getNestedType(); - const Array & array = field.get(); + const Array & array = field.safeGet(); for (size_t i=0; i < array.size(); ++i) { seed = applyGeneric(array[i], seed, nested_type); diff --git a/cpp-ch/local-engine/Functions/SparkFunctionMakeDecimal.cpp b/cpp-ch/local-engine/Functions/SparkFunctionMakeDecimal.cpp index 231856b0288f5..795e2b0be329a 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionMakeDecimal.cpp +++ b/cpp-ch/local-engine/Functions/SparkFunctionMakeDecimal.cpp @@ -205,7 +205,7 @@ namespace else return false; } - result = static_cast(convert_to.get()); + result = static_cast(convert_to.safeGet()); ToNativeType pow10 = intExp10OfSize(precision_value); if ((result < 0 && result <= -pow10) || result >= pow10) diff --git a/cpp-ch/local-engine/Functions/SparkFunctionRoundHalfUp.h b/cpp-ch/local-engine/Functions/SparkFunctionRoundHalfUp.h index 441842d4e7e1a..0bd28b116d9aa 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionRoundHalfUp.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionRoundHalfUp.h @@ -271,7 +271,7 @@ class FunctionRoundingHalfUp : public IFunction if (scale_field.getType() != Field::Types::UInt64 && scale_field.getType() != Field::Types::Int64) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Scale argument for rounding functions must have integer type"); - Int64 scale64 = scale_field.get(); + Int64 scale64 = scale_field.safeGet(); if (scale64 > std::numeric_limits::max() || scale64 < std::numeric_limits::min()) throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale argument for rounding function is too large"); diff --git a/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h b/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h index 980af85bd9838..aab8aabc3a8d3 100644 --- a/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h +++ b/cpp-ch/local-engine/Functions/SparkFunctionToDateTime.h @@ -128,7 +128,7 @@ class SparkFunctionConvertToDateTime : public IFunction Field field; named_column.column->get(0, field); - return static_cast(field.get()); + return static_cast(field.safeGet()); } DB::DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override diff --git a/cpp-ch/local-engine/Join/BroadCastJoinBuilder.cpp b/cpp-ch/local-engine/Join/BroadCastJoinBuilder.cpp index f47f423df89b3..da301dcb89f87 100644 --- a/cpp-ch/local-engine/Join/BroadCastJoinBuilder.cpp +++ b/cpp-ch/local-engine/Join/BroadCastJoinBuilder.cpp @@ -57,13 +57,26 @@ jlong callJavaGet(const std::string & id) DB::Block resetBuildTableBlockName(Block & block, bool only_one = false) { DB::ColumnsWithTypeAndName new_cols; + std::set names; + int32_t seq = 0; for (const auto & col : block) { - // Add a prefix to avoid column name conflicts with left table. - new_cols.emplace_back(col.column, col.type, BlockUtil::RIHGT_COLUMN_PREFIX + col.name); - - if (only_one) - break; + // Add a prefix to avoid column name conflicts with left table. 
+ std::stringstream new_name; + // add a sequence to avoid duplicate name in some rare cases + if (names.find(col.name) == names.end()) + { + new_name << BlockUtil::RIHGT_COLUMN_PREFIX << col.name; + names.insert(col.name); + } + else + { + new_name << BlockUtil::RIHGT_COLUMN_PREFIX << (seq++) << "_" << col.name; + } + new_cols.emplace_back(col.column, col.type, new_name.str()); + + if (only_one) + break; } return DB::Block(new_cols); } diff --git a/cpp-ch/local-engine/Operator/ExpandStep.cpp b/cpp-ch/local-engine/Operator/ExpandStep.cpp index 9f56d9fd9460f..8770c4c405cc1 100644 --- a/cpp-ch/local-engine/Operator/ExpandStep.cpp +++ b/cpp-ch/local-engine/Operator/ExpandStep.cpp @@ -52,16 +52,18 @@ ExpandStep::ExpandStep(const DB::DataStream & input_stream_, const ExpandField & output_header = getOutputStream().header; } -DB::Block ExpandStep::buildOutputHeader(const DB::Block & , const ExpandField & project_set_exprs_) +DB::Block ExpandStep::buildOutputHeader(const DB::Block &, const ExpandField & project_set_exprs_) { DB::ColumnsWithTypeAndName cols; const auto & types = project_set_exprs_.getTypes(); const auto & names = project_set_exprs_.getNames(); + chassert(names.size() == types.size()); + for (size_t i = 0; i < project_set_exprs_.getExpandCols(); ++i) - cols.push_back(DB::ColumnWithTypeAndName(types[i], names[i])); + cols.emplace_back(DB::ColumnWithTypeAndName(types[i], names[i])); - return DB::Block(cols); + return DB::Block(std::move(cols)); } void ExpandStep::transformPipeline(DB::QueryPipelineBuilder & pipeline, const DB::BuildQueryPipelineSettings & /*settings*/) diff --git a/cpp-ch/local-engine/Operator/ExpandTransform.cpp b/cpp-ch/local-engine/Operator/ExpandTransform.cpp index f5787163c5a1e..29e254bc01a76 100644 --- a/cpp-ch/local-engine/Operator/ExpandTransform.cpp +++ b/cpp-ch/local-engine/Operator/ExpandTransform.cpp @@ -15,19 +15,20 @@ * limitations under the License. 
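resetBuildTableBlockName above now guards against duplicate column names on the broadcast build side: every column still gets the right-table prefix, and a repeated source name additionally gets a sequence number so the renamed header stays unique. A small sketch of that naming rule; the "right_" prefix string is only a stand-in for BlockUtil::RIHGT_COLUMN_PREFIX.

#include <cstdint>
#include <iostream>
#include <set>
#include <sstream>
#include <string>
#include <vector>

// Rename build-side columns with a fixed prefix; if the same source name shows
// up twice, splice in an increasing sequence number to keep the result unique.
std::vector<std::string> renameBuildSideColumns(const std::vector<std::string> & names)
{
    const std::string prefix = "right_";
    std::set<std::string> seen;
    int32_t seq = 0;
    std::vector<std::string> renamed;
    for (const auto & name : names)
    {
        std::stringstream new_name;
        if (seen.insert(name).second)
            new_name << prefix << name;
        else
            new_name << prefix << (seq++) << "_" << name;
        renamed.push_back(new_name.str());
    }
    return renamed;
}

int main()
{
    for (const auto & n : renameBuildSideColumns({"k", "v", "k"}))
        std::cout << n << '\n'; // right_k, right_v, right_0_k
}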
*/ #include +#include #include #include #include #include #include +#include #include -#include "ExpandTransorm.h" - -#include #include #include +#include "ExpandTransorm.h" + namespace DB { namespace ErrorCodes @@ -93,53 +94,42 @@ void ExpandTransform::work() if (expand_expr_iterator >= project_set_exprs.getExpandRows()) throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "expand_expr_iterator >= project_set_exprs.getExpandRows()"); - const auto & original_cols = input_chunk.getColumns(); + const auto & input_header = getInputs().front().getHeader(); + const auto & input_columns = input_chunk.getColumns(); + const auto & types = project_set_exprs.getTypes(); + const auto & kinds = project_set_exprs.getKinds()[expand_expr_iterator]; + const auto & fields = project_set_exprs.getFields()[expand_expr_iterator]; size_t rows = input_chunk.getNumRows(); - DB::Columns cols; - for (size_t j = 0; j < project_set_exprs.getExpandCols(); ++j) + + DB::Columns columns(types.size()); + for (size_t col_i = 0; col_i < types.size(); ++col_i) { - const auto & type = project_set_exprs.getTypes()[j]; - const auto & kind = project_set_exprs.getKinds()[expand_expr_iterator][j]; - const auto & field = project_set_exprs.getFields()[expand_expr_iterator][j]; + const auto & type = types[col_i]; + const auto & kind = kinds[col_i]; + const auto & field = fields[col_i]; if (kind == EXPAND_FIELD_KIND_SELECTION) { - const auto & original_col = original_cols.at(field.get()); - if (type->isNullable() == original_col->isNullable()) - { - cols.push_back(original_col); - } - else if (type->isNullable() && !original_col->isNullable()) - { - auto null_map = DB::ColumnUInt8::create(rows, 0); - auto col = DB::ColumnNullable::create(original_col, std::move(null_map)); - cols.push_back(std::move(col)); - } - else - { - throw DB::Exception( - DB::ErrorCodes::LOGICAL_ERROR, - "Miss match nullable, column {} is nullable, but type {} is not nullable", - original_col->getName(), - type->getName()); - } + auto index = field.safeGet(); + const auto & input_column = input_columns[index]; + + DB::ColumnWithTypeAndName input_arg; + input_arg.column = input_column; + input_arg.type = input_header.getByPosition(index).type; + /// input_column maybe non-Nullable + columns[col_i] = DB::castColumn(input_arg, type); } - else if (field.isNull()) + else if (kind == EXPAND_FIELD_KIND_LITERAL) { - // Add null column - auto null_map = DB::ColumnUInt8::create(rows, 1); - auto nested_type = DB::removeNullable(type); - auto col = DB::ColumnNullable::create(nested_type->createColumn()->cloneResized(rows), std::move(null_map)); - cols.push_back(std::move(col)); + /// Add const column with field value + auto column = type->createColumnConst(rows, field); + columns[col_i] = column; } else - { - // Add constant column: gid, gpos, etc. - auto col = type->createColumnConst(rows, field); - cols.push_back(col->convertToFullColumnIfConst()); - } + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Unknown ExpandFieldKind {}", magic_enum::enum_name(kind)); } - output_chunk = DB::Chunk(cols, rows); + + output_chunk = DB::Chunk(std::move(columns), rows); has_output = true; ++expand_expr_iterator; diff --git a/cpp-ch/local-engine/Operator/ExpandTransorm.h b/cpp-ch/local-engine/Operator/ExpandTransorm.h index 90bdf3dc13dc5..f315ca5db35ef 100644 --- a/cpp-ch/local-engine/Operator/ExpandTransorm.h +++ b/cpp-ch/local-engine/Operator/ExpandTransorm.h @@ -15,21 +15,21 @@ * limitations under the License. 
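The rewritten ExpandTransform::work above builds each expanded chunk from two kinds of fields: EXPAND_FIELD_KIND_SELECTION picks an input column (casting it to the declared output type, e.g. wrapping it in Nullable), while EXPAND_FIELD_KIND_LITERAL emits a constant column such as a grouping id or a NULLed-out grouping column. The sketch below mimics that per-expand-row loop on plain vectors; the type cast is reduced to "nullable int64" cells for brevity, and all names are illustrative.

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <variant>
#include <vector>

// One output field of one expand row: either "take input column #i"
// (selection) or "emit this constant for every row" (literal). A null
// literal models the grouping columns blanked out for rollup/cube.
using Cell = std::optional<int64_t>;
using Column = std::vector<Cell>;
struct Selection { size_t input_index; };
struct Literal { Cell value; };
using ExpandField = std::variant<Selection, Literal>;

// Produce the output columns for a single expand row, as the transform does
// once per value of expand_expr_iterator.
std::vector<Column> expandOneRow(const std::vector<Column> & input, const std::vector<ExpandField> & fields)
{
    const size_t rows = input.empty() ? 0 : input.front().size();
    std::vector<Column> output;
    for (const auto & field : fields)
    {
        if (const auto * sel = std::get_if<Selection>(&field))
            output.push_back(input[sel->input_index]);                      // selection: reuse the input column
        else
            output.push_back(Column(rows, std::get<Literal>(field).value)); // literal: constant column
    }
    return output;
}

int main()
{
    std::vector<Column> input = {{1, 2}, {10, 20}}; // two columns, two rows
    // One expand row: keep column 0, blank out column 1, append a grouping id of 1.
    auto out = expandOneRow(input, {Selection{0}, Literal{std::nullopt}, Literal{1}});
    for (const auto & col : out)
    {
        for (const auto & cell : col)
            std::cout << (cell ? std::to_string(*cell) : "NULL") << ' ';
        std::cout << '\n';
    }
}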
*/ #pragma once -#include -#include + #include #include #include #include #include + namespace local_engine { // For handling substrait expand node. // The implementation in spark for groupingsets/rollup/cube is different from Clickhouse. -// We have to ways to support groupingsets/rollup/cube -// - rewrite the substrait plan in local engine and reuse the implementation of clickhouse. This +// We have two ways to support groupingsets/rollup/cube +// - Rewrite the substrait plan in local engine and reuse the implementation of clickhouse. This // may be more complex. -// - implement new transform to do the expandation. It's more simple, but may suffer some performance +// - Implement new transform to do the expandation. It's simpler, but may suffer some performance // issues. We try this first. class ExpandTransform : public DB::IProcessor { diff --git a/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.cpp b/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.cpp index a7a07c0bf31ae..42d4f4d4d8cdf 100644 --- a/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.cpp +++ b/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.cpp @@ -57,6 +57,24 @@ void tryAssign(const std::unordered_map & kvs, const Strin } } +template<> +void tryAssign(const std::unordered_map & kvs, const String & key, Int64 & v) +{ + auto it = kvs.find(key); + if (it != kvs.end()) + { + try + { + v = std::stol(it->second); + } + catch (...) + { + LOG_ERROR(getLogger("tryAssign"), "Invalid number: {}", it->second); + throw; + } + } +} + template void readStringUntilCharsInto(String & s, DB::ReadBuffer & buf) { @@ -121,6 +139,11 @@ JoinOptimizationInfo JoinOptimizationInfo::parse(const String & advance) tryAssign(kvs, "buildHashTableId", info.storage_join_key); tryAssign(kvs, "isNullAwareAntiJoin", info.is_null_aware_anti_join); tryAssign(kvs, "isExistenceJoin", info.is_existence_join); + tryAssign(kvs, "leftRowCount", info.left_table_rows); + tryAssign(kvs, "leftSizeInBytes", info.left_table_bytes); + tryAssign(kvs, "rightRowCount", info.right_table_rows); + tryAssign(kvs, "rightSizeInBytes", info.right_table_bytes); + tryAssign(kvs, "numPartitions", info.partitions_num); return info; } } diff --git a/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.h b/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.h index 5a15a3ea8abc6..5f6fe6d256e3c 100644 --- a/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.h +++ b/cpp-ch/local-engine/Parser/AdvancedParametersParseUtil.h @@ -29,6 +29,11 @@ struct JoinOptimizationInfo bool is_smj = false; bool is_null_aware_anti_join = false; bool is_existence_join = false; + Int64 left_table_rows = -1; + Int64 left_table_bytes = -1; + Int64 right_table_rows = -1; + Int64 right_table_bytes = -1; + Int64 partitions_num = -1; String storage_join_key; static JoinOptimizationInfo parse(const String & advance); diff --git a/cpp-ch/local-engine/Parser/AggregateFunctionParser.cpp b/cpp-ch/local-engine/Parser/AggregateFunctionParser.cpp index f976d50ad3b2f..b843d1565fcef 100644 --- a/cpp-ch/local-engine/Parser/AggregateFunctionParser.cpp +++ b/cpp-ch/local-engine/Parser/AggregateFunctionParser.cpp @@ -155,7 +155,7 @@ const DB::ActionsDAG::Node * AggregateFunctionParser::convertNodeTypeIfNeeded( if (need_convert_type) { func_node = ActionsDAGUtil::convertNodeType( - actions_dag, func_node, TypeParser::parseType(output_type)->getName(), func_node->result_name); + actions_dag, func_node, TypeParser::parseType(output_type), func_node->result_name); 
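The AdvancedParametersParseUtil change above adds an Int64 specialization of tryAssign so the new join statistics (leftRowCount, rightSizeInBytes, numPartitions, and so on) can be pulled out of the advanced-extension key/value string, defaulting to -1 when a key is missing and failing loudly on a malformed number. A compact sketch of that helper, with names chosen for illustration:

#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>
#include <unordered_map>

// Assign kvs[key] to v only when the key exists; leave the default (-1) otherwise.
// A malformed value is a hard error, mirroring the log-and-rethrow in the patch.
void tryAssignInt64(const std::unordered_map<std::string, std::string> & kvs, const std::string & key, int64_t & v)
{
    auto it = kvs.find(key);
    if (it == kvs.end())
        return;
    try
    {
        v = std::stoll(it->second);
    }
    catch (const std::exception &)
    {
        throw std::invalid_argument("Invalid number for key '" + key + "': " + it->second);
    }
}

int main()
{
    std::unordered_map<std::string, std::string> kvs = {{"rightRowCount", "123456"}, {"numPartitions", "200"}};
    int64_t right_table_rows = -1, left_table_rows = -1, partitions_num = -1;
    tryAssignInt64(kvs, "rightRowCount", right_table_rows);
    tryAssignInt64(kvs, "leftRowCount", left_table_rows);  // key absent -> stays -1
    tryAssignInt64(kvs, "numPartitions", partitions_num);
    std::cout << right_table_rows << ' ' << left_table_rows << ' ' << partitions_num << '\n'; // 123456 -1 200
}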
actions_dag.addOrReplaceInOutputs(*func_node); } diff --git a/cpp-ch/local-engine/Parser/AggregateRelParser.cpp b/cpp-ch/local-engine/Parser/AggregateRelParser.cpp index bf5129f13277f..494403cf4cd31 100644 --- a/cpp-ch/local-engine/Parser/AggregateRelParser.cpp +++ b/cpp-ch/local-engine/Parser/AggregateRelParser.cpp @@ -195,7 +195,8 @@ void AggregateRelParser::addPreProjection() void AggregateRelParser::buildAggregateDescriptions(AggregateDescriptions & descriptions) { - auto build_result_column_name = [](const String & function_name, const Array & params, const Strings & arg_names, substrait::AggregationPhase phase) + const auto & current_plan_header = plan->getCurrentDataStream().header; + auto build_result_column_name = [this, current_plan_header](const String & function_name, const Array & params, const Strings & arg_names, substrait::AggregationPhase phase) { if (phase == substrait::AggregationPhase::AGGREGATION_PHASE_INTERMEDIATE_TO_RESULT) { @@ -219,7 +220,12 @@ void AggregateRelParser::buildAggregateDescriptions(AggregateDescriptions & desc result += "("; result += boost::algorithm::join(arg_names, ","); result += ")"; - return result; + // Make the name unique to avoid name collision(issue #6878). + auto res = this->getUniqueName(result); + // Just a check for remining this issue. + if (current_plan_header.findByName(res)) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Name ({}) collision in header: {}", res, current_plan_header.dumpStructure()); + return res; }; for (auto & agg_info : aggregates) @@ -228,6 +234,7 @@ void AggregateRelParser::buildAggregateDescriptions(AggregateDescriptions & desc const auto & measure = agg_info.measure->measure(); description.column_name = build_result_column_name(agg_info.function_name, agg_info.params, agg_info.arg_column_names, measure.phase()); + agg_info.measure_column_name = description.column_name; // std::cout << "description.column_name:" << description.column_name << std::endl; description.argument_names = agg_info.arg_column_names; diff --git a/cpp-ch/local-engine/Parser/CHColumnToSparkRow.cpp b/cpp-ch/local-engine/Parser/CHColumnToSparkRow.cpp index 5bb66e4b3f9dd..602cd3d6837e5 100644 --- a/cpp-ch/local-engine/Parser/CHColumnToSparkRow.cpp +++ b/cpp-ch/local-engine/Parser/CHColumnToSparkRow.cpp @@ -501,7 +501,7 @@ int64_t BackingDataLengthCalculator::calculate(const Field & field) const if (which.isStringOrFixedString()) { - const auto & str = field.get(); + const auto & str = field.safeGet(); return roundNumberOfBytesToNearestWord(str.size()); } @@ -511,7 +511,7 @@ int64_t BackingDataLengthCalculator::calculate(const Field & field) const if (which.isArray()) { /// 内存布局:numElements(8B) | null_bitmap(与numElements成正比) | values(每个值长度与类型有关) | backing buffer - const auto & array = field.get(); /// Array can not be wrapped with Nullable + const auto & array = field.safeGet(); /// Array can not be wrapped with Nullable const auto num_elems = array.size(); int64_t res = 8 + calculateBitSetWidthInBytes(num_elems); @@ -531,7 +531,7 @@ int64_t BackingDataLengthCalculator::calculate(const Field & field) const int64_t res = 8; /// Construct Array of keys and values from Map - const auto & map = field.get(); /// Map can not be wrapped with Nullable + const auto & map = field.safeGet(); /// Map can not be wrapped with Nullable const auto num_keys = map.size(); auto array_key = Array(); auto array_val = Array(); @@ -539,7 +539,7 @@ int64_t BackingDataLengthCalculator::calculate(const Field & field) const array_val.reserve(num_keys); for (size_t 
i = 0; i < num_keys; ++i) { - const auto & pair = map[i].get(); + const auto & pair = map[i].safeGet(); array_key.push_back(pair[0]); array_val.push_back(pair[1]); } @@ -561,7 +561,7 @@ int64_t BackingDataLengthCalculator::calculate(const Field & field) const if (which.isTuple()) { /// 内存布局:null_bitmap(字节数与字段数成正比) | field1 value(8B) | field2 value(8B) | ... | fieldn value(8B) | backing buffer - const auto & tuple = field.get(); /// Tuple can not be wrapped with Nullable + const auto & tuple = field.safeGet(); /// Tuple can not be wrapped with Nullable const auto * type_tuple = typeid_cast(type_without_nullable.get()); const auto & type_fields = type_tuple->getElements(); const auto num_fields = type_fields.size(); @@ -586,12 +586,11 @@ int64_t BackingDataLengthCalculator::getArrayElementSize(const DataTypePtr & nes else if (nested_which.isUInt16() || nested_which.isInt16() || nested_which.isDate()) return 2; else if ( - nested_which.isUInt32() || nested_which.isInt32() || nested_which.isFloat32() || nested_which.isDate32() - || nested_which.isDecimal32()) + nested_which.isUInt32() || nested_which.isInt32() || nested_which.isFloat32() || nested_which.isDate32()) return 4; else if ( nested_which.isUInt64() || nested_which.isInt64() || nested_which.isFloat64() || nested_which.isDateTime64() - || nested_which.isDecimal64()) + || nested_which.isDecimal32() || nested_which.isDecimal64()) return 8; else return 8; @@ -688,24 +687,7 @@ int64_t VariableLengthDataWriter::writeArray(size_t row_idx, const DB::Array & a bitSet(buffer_address + offset + start + 8, i); else { - if (writer.getWhichDataType().isFloat32()) - { - // We can not use get() directly here to process Float32 field, - // because it will get 8 byte data, but Float32 is 4 byte, which will cause error conversion. - auto v = static_cast(elem.get()); - writer.unsafeWrite( - reinterpret_cast(&v), buffer_address + offset + start + 8 + len_null_bitmap + i * elem_size); - } - else if (writer.getWhichDataType().isFloat64()) - { - // Fix 'Invalid Field get from type Float64 to type Int64' in debug build. - auto v = elem.get(); - writer.unsafeWrite(reinterpret_cast(&v), buffer_address + offset + start + 8 + len_null_bitmap + i * elem_size); - } - else - writer.unsafeWrite( - reinterpret_cast(&elem.get()), - buffer_address + offset + start + 8 + len_null_bitmap + i * elem_size); + writer.write(elem, buffer_address + offset + start + 8 + len_null_bitmap + i * elem_size); } } } @@ -749,7 +731,7 @@ int64_t VariableLengthDataWriter::writeMap(size_t row_idx, const DB::Map & map, val_array.reserve(num_pairs); for (size_t i = 0; i < num_pairs; ++i) { - const auto & pair = map[i].get(); + const auto & pair = map[i].safeGet(); key_array.push_back(pair[0]); val_array.push_back(pair[1]); } @@ -807,27 +789,7 @@ int64_t VariableLengthDataWriter::writeStruct(size_t row_idx, const DB::Tuple & if (BackingDataLengthCalculator::isFixedLengthDataType(removeNullable(field_type))) { FixedLengthDataWriter writer(field_type); - if (writer.getWhichDataType().isFloat32()) - { - // We can not use get() directly here to process Float32 field, - // because it will get 8 byte data, but Float32 is 4 byte, which will cause error conversion. - auto v = static_cast(field_value.get()); - writer.unsafeWrite(reinterpret_cast(&v), buffer_address + offset + start + len_null_bitmap + i * 8); - } - else if (writer.getWhichDataType().isFloat64()) - { - // Fix 'Invalid Field get from type Float64 to type Int64' in debug build. 
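The writeArray/writeStruct cleanups fold the old Float32/Float64 special cases into FixedLengthDataWriter::write. The reason those special cases existed is spelled out in the surviving comments: the row format stores a REAL in 4 bytes, while the generic field storage keeps floating-point values as 8-byte doubles, so the value must be narrowed to float before the memcpy. A stripped-down illustration of that write path (SlotType and writeFixedLength are invented names for the sketch):

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

enum class SlotType { Int64, Float32, Float64 };

// Write one fixed-length value into a Spark-row-like buffer slot. The in-memory
// field value for both float widths is a double, so Float32 must be narrowed
// to 4 bytes instead of copying 8 bytes of a double.
void writeFixedLength(SlotType type, double field_value, int64_t int_value, char * slot)
{
    switch (type)
    {
        case SlotType::Int64:
            std::memcpy(slot, &int_value, 8);
            break;
        case SlotType::Float32: {
            const float narrowed = static_cast<float>(field_value);
            std::memcpy(slot, &narrowed, 4); // 4-byte REAL slot
            break;
        }
        case SlotType::Float64:
            std::memcpy(slot, &field_value, 8);
            break;
    }
}

int main()
{
    std::vector<char> buffer(8, 0);
    writeFixedLength(SlotType::Float32, 3.5, 0, buffer.data());
    float readback;
    std::memcpy(&readback, buffer.data(), 4);
    std::cout << readback << '\n'; // 3.5
}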
- auto v = field_value.get(); - writer.unsafeWrite(reinterpret_cast(&v), buffer_address + offset + start + len_null_bitmap + i * 8); - } - else if (writer.getWhichDataType().isDecimal64() || writer.getWhichDataType().isDateTime64()) - { - auto v = field_value.get(); - writer.unsafeWrite(reinterpret_cast(&v), buffer_address + offset + start + len_null_bitmap + i * 8); - } - else - writer.unsafeWrite( - reinterpret_cast(&field_value.get()), buffer_address + offset + start + len_null_bitmap + i * 8); + writer.write(field_value, buffer_address + offset + start + len_null_bitmap + i * 8); } else { @@ -848,7 +810,7 @@ int64_t VariableLengthDataWriter::write(size_t row_idx, const DB::Field & field, if (which.isStringOrFixedString()) { - const auto & str = field.get(); + const auto & str = field.safeGet(); return writeUnalignedBytes(row_idx, str.data(), str.size(), parent_offset); } @@ -863,19 +825,19 @@ int64_t VariableLengthDataWriter::write(size_t row_idx, const DB::Field & field, if (which.isArray()) { - const auto & array = field.get(); + const auto & array = field.safeGet(); return writeArray(row_idx, array, parent_offset); } if (which.isMap()) { - const auto & map = field.get(); + const auto & map = field.safeGet(); return writeMap(row_idx, map, parent_offset); } if (which.isTuple()) { - const auto & tuple = field.get(); + const auto & tuple = field.safeGet(); return writeStruct(row_idx, tuple, parent_offset); } @@ -921,64 +883,64 @@ void FixedLengthDataWriter::write(const DB::Field & field, char * buffer) if (which.isUInt8()) { - const auto value = UInt8(field.get()); + const auto value = static_cast(field.safeGet()); memcpy(buffer, &value, 1); } else if (which.isUInt16() || which.isDate()) { - const auto value = UInt16(field.get()); + const auto value = static_cast(field.safeGet()); memcpy(buffer, &value, 2); } else if (which.isUInt32() || which.isDate32()) { - const auto value = UInt32(field.get()); + const auto value = static_cast(field.safeGet()); memcpy(buffer, &value, 4); } else if (which.isUInt64()) { - const auto & value = field.get(); + const auto & value = field.safeGet(); memcpy(buffer, &value, 8); } else if (which.isInt8()) { - const auto value = Int8(field.get()); + const auto value = static_cast(field.safeGet()); memcpy(buffer, &value, 1); } else if (which.isInt16()) { - const auto value = Int16(field.get()); + const auto value = static_cast(field.safeGet()); memcpy(buffer, &value, 2); } else if (which.isInt32()) { - const auto value = Int32(field.get()); + const auto value = static_cast(field.safeGet()); memcpy(buffer, &value, 4); } else if (which.isInt64()) { - const auto & value = field.get(); + const auto & value = field.safeGet(); memcpy(buffer, &value, 8); } else if (which.isFloat32()) { - const auto value = Float32(field.get()); + const auto value = static_cast(field.safeGet()); memcpy(buffer, &value, 4); } else if (which.isFloat64()) { - const auto & value = field.get(); + const auto & value = field.safeGet(); memcpy(buffer, &value, 8); } else if (which.isDecimal32()) { - const auto & value = field.get(); + const auto & value = field.safeGet(); const Int64 decimal = static_cast(value.getValue()); memcpy(buffer, &decimal, 8); } else if (which.isDecimal64() || which.isDateTime64()) { - const auto & value = field.get(); - auto decimal = value.getValue(); + const auto & value = field.safeGet(); + const auto decimal = value.getValue(); memcpy(buffer, &decimal, 8); } else diff --git a/cpp-ch/local-engine/Parser/FunctionParser.cpp 
b/cpp-ch/local-engine/Parser/FunctionParser.cpp index d46110431ab40..a875da275501e 100644 --- a/cpp-ch/local-engine/Parser/FunctionParser.cpp +++ b/cpp-ch/local-engine/Parser/FunctionParser.cpp @@ -80,8 +80,8 @@ const ActionsDAG::Node * FunctionParser::convertNodeTypeIfNeeded( actions_dag, func_node, // as stated in isTypeMatched, currently we don't change nullability of the result type - func_node->result_type->isNullable() ? local_engine::wrapNullableType(true, result_type)->getName() - : local_engine::removeNullable(result_type)->getName(), + func_node->result_type->isNullable() ? local_engine::wrapNullableType(true, result_type) + : local_engine::removeNullable(result_type), func_node->result_name, CastType::accurateOrNull); } @@ -91,8 +91,8 @@ const ActionsDAG::Node * FunctionParser::convertNodeTypeIfNeeded( actions_dag, func_node, // as stated in isTypeMatched, currently we don't change nullability of the result type - func_node->result_type->isNullable() ? local_engine::wrapNullableType(true, TypeParser::parseType(output_type))->getName() - : DB::removeNullable(TypeParser::parseType(output_type))->getName(), + func_node->result_type->isNullable() ? local_engine::wrapNullableType(true, TypeParser::parseType(output_type)) + : DB::removeNullable(TypeParser::parseType(output_type)), func_node->result_name); } } diff --git a/cpp-ch/local-engine/Parser/JoinRelParser.cpp b/cpp-ch/local-engine/Parser/JoinRelParser.cpp index 8fbaf2feca717..0446a397c0087 100644 --- a/cpp-ch/local-engine/Parser/JoinRelParser.cpp +++ b/cpp-ch/local-engine/Parser/JoinRelParser.cpp @@ -16,6 +16,8 @@ */ #include "JoinRelParser.h" +#include +#include #include #include #include @@ -25,15 +27,15 @@ #include #include #include -#include #include +#include #include #include #include #include #include #include -#include +#include #include @@ -42,20 +44,19 @@ namespace DB { namespace ErrorCodes { - extern const int LOGICAL_ERROR; - extern const int UNKNOWN_TYPE; - extern const int BAD_ARGUMENTS; +extern const int LOGICAL_ERROR; +extern const int UNKNOWN_TYPE; +extern const int BAD_ARGUMENTS; } } using namespace DB; namespace local_engine { -std::shared_ptr createDefaultTableJoin(substrait::JoinRel_JoinType join_type, bool is_existence_join) +std::shared_ptr createDefaultTableJoin(substrait::JoinRel_JoinType join_type, bool is_existence_join, ContextPtr & context) { - auto & global_context = SerializedPlanParser::global_context; auto table_join = std::make_shared( - global_context->getSettingsRef(), global_context->getGlobalTemporaryVolume(), global_context->getTempDataOnDisk()); + context->getSettingsRef(), context->getGlobalTemporaryVolume(), context->getTempDataOnDisk()); std::pair kind_and_strictness = JoinUtil::getJoinKindAndStrictness(join_type, is_existence_join); table_join->setKind(kind_and_strictness.first); @@ -98,7 +99,8 @@ DB::QueryPlanPtr JoinRelParser::parseOp(const substrait::Rel & rel, std::list JoinRelParser::extractTableSidesFromExpression(const substrait::Expression & expr, const DB::Block & left_header, const DB::Block & right_header) +std::unordered_set JoinRelParser::extractTableSidesFromExpression( + const substrait::Expression & expr, const DB::Block & left_header, const DB::Block & right_header) { std::unordered_set table_sides; if (expr.has_scalar_function()) @@ -169,8 +171,7 @@ void JoinRelParser::renamePlanColumns(DB::QueryPlan & left, DB::QueryPlan & righ storage_join.getRightSampleBlock().getColumnsWithTypeAndName(), ActionsDAG::MatchColumnsMode::Position); - QueryPlanStepPtr 
right_project_step = - std::make_unique(right.getCurrentDataStream(), std::move(right_project)); + QueryPlanStepPtr right_project_step = std::make_unique(right.getCurrentDataStream(), std::move(right_project)); right_project_step->setStepDescription("Rename Broadcast Table Name"); steps.emplace_back(right_project_step.get()); right.addStep(std::move(right_project_step)); @@ -193,12 +194,9 @@ void JoinRelParser::renamePlanColumns(DB::QueryPlan & left, DB::QueryPlan & righ } } ActionsDAG left_project = ActionsDAG::makeConvertingActions( - left.getCurrentDataStream().header.getColumnsWithTypeAndName(), - new_left_cols, - ActionsDAG::MatchColumnsMode::Position); + left.getCurrentDataStream().header.getColumnsWithTypeAndName(), new_left_cols, ActionsDAG::MatchColumnsMode::Position); - QueryPlanStepPtr left_project_step = - std::make_unique(left.getCurrentDataStream(), std::move(left_project)); + QueryPlanStepPtr left_project_step = std::make_unique(left.getCurrentDataStream(), std::move(left_project)); left_project_step->setStepDescription("Rename Left Table Name for broadcast join"); steps.emplace_back(left_project_step.get()); left.addStep(std::move(left_project_step)); @@ -206,16 +204,18 @@ void JoinRelParser::renamePlanColumns(DB::QueryPlan & left, DB::QueryPlan & righ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::QueryPlanPtr left, DB::QueryPlanPtr right) { + auto join_config = JoinConfig::loadFromContext(getContext()); google::protobuf::StringValue optimization_info; optimization_info.ParseFromString(join.advanced_extension().optimization().value()); auto join_opt_info = JoinOptimizationInfo::parse(optimization_info.value()); + LOG_DEBUG(getLogger("JoinRelParser"), "optimization info:{}", optimization_info.value()); auto storage_join = join_opt_info.is_broadcast ? 
BroadCastJoinBuilder::getJoin(join_opt_info.storage_join_key) : nullptr; if (storage_join) { renamePlanColumns(*left, *right, *storage_join); } - auto table_join = createDefaultTableJoin(join.type(), join_opt_info.is_existence_join); + auto table_join = createDefaultTableJoin(join.type(), join_opt_info.is_existence_join, context); DB::Block right_header_before_convert_step = right->getCurrentDataStream().header; addConvertStep(*table_join, *left, *right); @@ -239,7 +239,9 @@ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::Q } if (is_col_names_changed) { - throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "For broadcast join, we must not change the columns name in the right table.\nleft header:{},\nright header: {} -> {}", + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, + "For broadcast join, we must not change the columns name in the right table.\nleft header:{},\nright header: {} -> {}", left->getCurrentDataStream().header.dumpStructure(), right_header_before_convert_step.dumpStructure(), right->getCurrentDataStream().header.dumpStructure()); @@ -266,7 +268,6 @@ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::Q if (storage_join) { - applyJoinFilter(*table_join, join, *left, *right, true); auto broadcast_hash_join = storage_join->getJoinLocked(table_join, context); @@ -288,15 +289,13 @@ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::Q /// TODO: make smj support mixed conditions if (need_post_filter && table_join->kind() != DB::JoinKind::Inner) { - throw DB::Exception( - DB::ErrorCodes::LOGICAL_ERROR, - "Sort merge join doesn't support mixed join conditions, except inner join."); + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Sort merge join doesn't support mixed join conditions, except inner join."); } JoinPtr smj_join = std::make_shared(table_join, right->getCurrentDataStream().header.cloneEmpty(), -1); MultiEnum join_algorithm = context->getSettingsRef().join_algorithm; QueryPlanStepPtr join_step - = std::make_unique(left->getCurrentDataStream(), right->getCurrentDataStream(), smj_join, 8192, 1, false); + = std::make_unique(left->getCurrentDataStream(), right->getCurrentDataStream(), smj_join, 8192, 1, false); join_step->setStepDescription("SORT_MERGE_JOIN"); steps.emplace_back(join_step.get()); @@ -311,41 +310,22 @@ DB::QueryPlanPtr JoinRelParser::parseJoin(const substrait::JoinRel & join, DB::Q } else { - applyJoinFilter(*table_join, join, *left, *right, true); - - /// Following is some configurations for grace hash join. - /// - spark.gluten.sql.columnar.backend.ch.runtime_settings.join_algorithm=grace_hash. This will - /// enable grace hash join. - /// - spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_in_join=3145728. This setup - /// the memory limitation fro grace hash join. If the memory consumption exceeds the limitation, - /// data will be spilled to disk. Don't set the limitation too small, otherwise the buckets number - /// will be too large and the performance will be bad. 
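With the statistics parsed from the advanced extension and the new JoinConfig knobs, the hash-join path that follows chooses between a single-clause hash join and the multi-join-on-clauses rewrite. The decision visible in the patch is roughly: use the rewrite only when it is enabled, the condition matches the supported OR-of-equality pattern, and the estimated build side per partition (rightRowCount / numPartitions) stays under multi_join_on_clauses_build_side_row_limit. A sketch of just that predicate, under those assumptions:

#include <cstdint>
#include <iostream>

struct JoinConfig
{
    bool prefer_multi_join_on_clauses = true;
    int64_t multi_join_on_clauses_build_side_rows_limit = 10'000'000;
};

// Mirrors the condition guarding buildMultiOnClauseHashJoin: statistics that
// were not provided arrive as -1 and disable the rewrite.
bool useMultiOnClauseHashJoin(bool condition_is_rewritable, const JoinConfig & config,
                              int64_t right_table_rows, int64_t partitions_num)
{
    return condition_is_rewritable
        && config.prefer_multi_join_on_clauses
        && right_table_rows > 0
        && partitions_num > 0
        && right_table_rows / partitions_num < config.multi_join_on_clauses_build_side_rows_limit;
}

int main()
{
    JoinConfig config;
    std::cout << useMultiOnClauseHashJoin(true, config, 50'000'000, 200) << '\n'; // 1: ~250k rows per partition
    std::cout << useMultiOnClauseHashJoin(true, config, -1, -1) << '\n';          // 0: no statistics available
}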
- JoinPtr hash_join = nullptr; - MultiEnum join_algorithm = context->getSettingsRef().join_algorithm; - if (join_algorithm.isSet(DB::JoinAlgorithm::GRACE_HASH)) + std::vector join_on_clauses; + if (table_join->getClauses().empty()) + table_join->addDisjunct(); + bool is_multi_join_on_clauses + = couldRewriteToMultiJoinOnClauses(table_join->getOnlyClause(), join_on_clauses, join, left_header, right_header); + if (is_multi_join_on_clauses && join_config.prefer_multi_join_on_clauses && join_opt_info.right_table_rows > 0 + && join_opt_info.partitions_num > 0 + && join_opt_info.right_table_rows / join_opt_info.partitions_num + < join_config.multi_join_on_clauses_build_side_rows_limit) { - hash_join = std::make_shared( - context, - table_join, - left->getCurrentDataStream().header, - right->getCurrentDataStream().header, - context->getTempDataOnDisk()); + query_plan = buildMultiOnClauseHashJoin(table_join, std::move(left), std::move(right), join_on_clauses); } else { - hash_join = std::make_shared(table_join, right->getCurrentDataStream().header.cloneEmpty()); + query_plan = buildSingleOnClauseHashJoin(join, table_join, std::move(left), std::move(right)); } - QueryPlanStepPtr join_step - = std::make_unique(left->getCurrentDataStream(), right->getCurrentDataStream(), hash_join, 8192, 1, false); - - join_step->setStepDescription("HASH_JOIN"); - steps.emplace_back(join_step.get()); - std::vector plans; - plans.emplace_back(std::move(left)); - plans.emplace_back(std::move(right)); - - query_plan = std::make_unique(); - query_plan->unitePlans(std::move(join_step), {std::move(plans)}); } JoinUtil::reorderJoinOutput(*query_plan, after_join_names); @@ -508,7 +488,11 @@ void JoinRelParser::collectJoinKeys( } bool JoinRelParser::applyJoinFilter( - DB::TableJoin & table_join, const substrait::JoinRel & join_rel, DB::QueryPlan & left, DB::QueryPlan & right, bool allow_mixed_condition) + DB::TableJoin & table_join, + const substrait::JoinRel & join_rel, + DB::QueryPlan & left, + DB::QueryPlan & right, + bool allow_mixed_condition) { if (!join_rel.has_post_join_filter()) return true; @@ -593,12 +577,14 @@ bool JoinRelParser::applyJoinFilter( if (!allow_mixed_condition) return false; auto mixed_join_expressions_actions = expressionsToActionsDAG({expr}, mixed_header); - table_join.getMixedJoinExpression() - = std::make_shared(std::move(mixed_join_expressions_actions), ExpressionActionsSettings::fromContext(context)); + mixed_join_expressions_actions.removeUnusedActions(); + table_join.getMixedJoinExpression() = std::make_shared( + std::move(mixed_join_expressions_actions), ExpressionActionsSettings::fromContext(context)); } else { - throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Not any table column is used in the join condition.\n{}", join_rel.DebugString()); + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, "Not any table column is used in the join condition.\n{}", join_rel.DebugString()); } return true; } @@ -609,7 +595,7 @@ void JoinRelParser::addPostFilter(DB::QueryPlan & query_plan, const substrait::J ActionsDAG actions_dag{query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName()}; if (!join.post_join_filter().has_scalar_function()) { - // It may be singular_or_list + // It may be singular_or_list auto * in_node = getPlanParser()->parseExpression(actions_dag, join.post_join_filter()); filter_name = in_node->result_name; } @@ -623,6 +609,214 @@ void JoinRelParser::addPostFilter(DB::QueryPlan & query_plan, const substrait::J query_plan.addStep(std::move(filter_step)); } +/// Only 
support following pattern: a1 = b1 or a2 = b2 or (a3 = b3 and a4 = b4) +bool JoinRelParser::couldRewriteToMultiJoinOnClauses( + const DB::TableJoin::JoinOnClause & prefix_clause, + std::vector & clauses, + const substrait::JoinRel & join_rel, + const DB::Block & left_header, + const DB::Block & right_header) +{ + /// There is only one join clause + if (!join_rel.has_post_join_filter()) + return false; + + const auto & filter_expr = join_rel.post_join_filter(); + std::list expression_stack; + expression_stack.push_back(&filter_expr); + + auto check_function = [&](const String function_name_, const substrait::Expression & e) + { + if (!e.has_scalar_function()) + { + return false; + } + auto function_name = parseFunctionName(e.scalar_function().function_reference(), e.scalar_function()); + return function_name.has_value() && *function_name == function_name_; + }; + + auto get_field_ref = [](const substrait::Expression & e) -> std::optional + { + if (e.has_selection() && e.selection().has_direct_reference() && e.selection().direct_reference().has_struct_field()) + { + return std::optional(e.selection().direct_reference().struct_field().field()); + } + return {}; + }; + + auto parse_join_keys = [&](const substrait::Expression & e) -> std::optional> + { + const auto & args = e.scalar_function().arguments(); + auto l_field_ref = get_field_ref(args[0].value()); + auto r_field_ref = get_field_ref(args[1].value()); + if (!l_field_ref.has_value() || !r_field_ref.has_value()) + return {}; + size_t l_pos = static_cast(*l_field_ref); + size_t r_pos = static_cast(*r_field_ref); + size_t l_cols = left_header.columns(); + size_t total_cols = l_cols + right_header.columns(); + + if (l_pos < l_cols && r_pos >= l_cols && r_pos < total_cols) + return std::make_pair(left_header.getByPosition(l_pos).name, right_header.getByPosition(r_pos - l_cols).name); + else if (r_pos < l_cols && l_pos >= l_cols && l_pos < total_cols) + return std::make_pair(left_header.getByPosition(r_pos).name, right_header.getByPosition(l_pos - l_cols).name); + return {}; + }; + + auto parse_and_expression = [&](const substrait::Expression & e, DB::TableJoin::JoinOnClause & join_on_clause) + { + std::vector and_expression_stack; + and_expression_stack.push_back(&e); + while (!and_expression_stack.empty()) + { + const auto & current_expr = *(and_expression_stack.back()); + and_expression_stack.pop_back(); + if (check_function("and", current_expr)) + { + for (const auto & arg : e.scalar_function().arguments()) + and_expression_stack.push_back(&arg.value()); + } + else if (check_function("equals", current_expr)) + { + auto optional_keys = parse_join_keys(current_expr); + if (!optional_keys) + { + LOG_ERROR(getLogger("JoinRelParser"), "Not equal comparison for keys from both tables"); + return false; + } + join_on_clause.addKey(optional_keys->first, optional_keys->second, false); + } + else + { + LOG_ERROR(getLogger("JoinRelParser"), "And or equals function is expected"); + return false; + } + } + return true; + }; + + while (!expression_stack.empty()) + { + const auto & current_expr = *(expression_stack.back()); + expression_stack.pop_back(); + if (!check_function("or", current_expr)) + { + LOG_ERROR(getLogger("JoinRelParser"), "Not an or expression"); + } + + auto get_current_join_on_clause = [&]() + { + DB::TableJoin::JoinOnClause new_clause = prefix_clause; + clauses.push_back(new_clause); + return &clauses.back(); + }; + + const auto & args = current_expr.scalar_function().arguments(); + for (const auto & arg : args) + { + if 
(check_function("equals", arg.value())) + { + auto optional_keys = parse_join_keys(arg.value()); + if (!optional_keys) + { + LOG_ERROR(getLogger("JoinRelParser"), "Not equal comparison for keys from both tables"); + return false; + } + get_current_join_on_clause()->addKey(optional_keys->first, optional_keys->second, false); + } + else if (check_function("and", arg.value())) + { + if (!parse_and_expression(arg.value(), *get_current_join_on_clause())) + { + LOG_ERROR(getLogger("JoinRelParser"), "Parse and expression failed"); + return false; + } + } + else if (check_function("or", arg.value())) + { + expression_stack.push_back(&arg.value()); + } + else + { + LOG_ERROR(getLogger("JoinRelParser"), "Unknow function"); + return false; + } + } + } + return true; +} + + +DB::QueryPlanPtr JoinRelParser::buildMultiOnClauseHashJoin( + std::shared_ptr table_join, + DB::QueryPlanPtr left_plan, + DB::QueryPlanPtr right_plan, + const std::vector & join_on_clauses) +{ + DB::TableJoin::JoinOnClause & base_join_on_clause = table_join->getOnlyClause(); + base_join_on_clause = join_on_clauses[0]; + for (size_t i = 1; i < join_on_clauses.size(); ++i) + { + table_join->addDisjunct(); + auto & join_on_clause = table_join->getClauses().back(); + join_on_clause = join_on_clauses[i]; + } + + LOG_INFO(getLogger("JoinRelParser"), "multi join on clauses:\n{}", DB::TableJoin::formatClauses(table_join->getClauses())); + + JoinPtr hash_join = std::make_shared(table_join, right_plan->getCurrentDataStream().header); + QueryPlanStepPtr join_step + = std::make_unique(left_plan->getCurrentDataStream(), right_plan->getCurrentDataStream(), hash_join, 8192, 1, false); + join_step->setStepDescription("Multi join on clause hash join"); + steps.emplace_back(join_step.get()); + std::vector plans; + plans.emplace_back(std::move(left_plan)); + plans.emplace_back(std::move(right_plan)); + auto query_plan = std::make_unique(); + query_plan->unitePlans(std::move(join_step), {std::move(plans)}); + return query_plan; +} + +DB::QueryPlanPtr JoinRelParser::buildSingleOnClauseHashJoin( + const substrait::JoinRel & join_rel, std::shared_ptr table_join, DB::QueryPlanPtr left_plan, DB::QueryPlanPtr right_plan) +{ + applyJoinFilter(*table_join, join_rel, *left_plan, *right_plan, true); + /// Following is some configurations for grace hash join. + /// - spark.gluten.sql.columnar.backend.ch.runtime_settings.join_algorithm=grace_hash. This will + /// enable grace hash join. + /// - spark.gluten.sql.columnar.backend.ch.runtime_settings.max_bytes_in_join=3145728. This setup + /// the memory limitation fro grace hash join. If the memory consumption exceeds the limitation, + /// data will be spilled to disk. Don't set the limitation too small, otherwise the buckets number + /// will be too large and the performance will be bad. 
+ JoinPtr hash_join = nullptr; + MultiEnum join_algorithm = context->getSettingsRef().join_algorithm; + if (join_algorithm.isSet(DB::JoinAlgorithm::GRACE_HASH)) + { + hash_join = std::make_shared( + context, + table_join, + left_plan->getCurrentDataStream().header, + right_plan->getCurrentDataStream().header, + context->getTempDataOnDisk()); + } + else + { + hash_join = std::make_shared(table_join, right_plan->getCurrentDataStream().header.cloneEmpty()); + } + QueryPlanStepPtr join_step + = std::make_unique(left_plan->getCurrentDataStream(), right_plan->getCurrentDataStream(), hash_join, 8192, 1, false); + + join_step->setStepDescription("HASH_JOIN"); + steps.emplace_back(join_step.get()); + std::vector plans; + plans.emplace_back(std::move(left_plan)); + plans.emplace_back(std::move(right_plan)); + + auto query_plan = std::make_unique(); + query_plan->unitePlans(std::move(join_step), {std::move(plans)}); + return query_plan; +} + void registerJoinRelParser(RelParserFactory & factory) { auto builder = [](SerializedPlanParser * plan_paser) { return std::make_shared(plan_paser); }; diff --git a/cpp-ch/local-engine/Parser/JoinRelParser.h b/cpp-ch/local-engine/Parser/JoinRelParser.h index ee1155cb47128..7e43187be308b 100644 --- a/cpp-ch/local-engine/Parser/JoinRelParser.h +++ b/cpp-ch/local-engine/Parser/JoinRelParser.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -70,6 +71,24 @@ class JoinRelParser : public RelParser static std::unordered_set extractTableSidesFromExpression( const substrait::Expression & expr, const DB::Block & left_header, const DB::Block & right_header); + + bool couldRewriteToMultiJoinOnClauses( + const DB::TableJoin::JoinOnClause & prefix_clause, + std::vector & clauses, + const substrait::JoinRel & join_rel, + const DB::Block & left_header, + const DB::Block & right_header); + + DB::QueryPlanPtr buildMultiOnClauseHashJoin( + std::shared_ptr table_join, + DB::QueryPlanPtr left_plan, + DB::QueryPlanPtr right_plan, + const std::vector & join_on_clauses); + DB::QueryPlanPtr buildSingleOnClauseHashJoin( + const substrait::JoinRel & join_rel, + std::shared_ptr table_join, + DB::QueryPlanPtr left_plan, + DB::QueryPlanPtr right_plan); }; } diff --git a/cpp-ch/local-engine/Parser/RelMetric.cpp b/cpp-ch/local-engine/Parser/RelMetric.cpp index 7b8b4cfd95a85..e138642607c4e 100644 --- a/cpp-ch/local-engine/Parser/RelMetric.cpp +++ b/cpp-ch/local-engine/Parser/RelMetric.cpp @@ -15,15 +15,63 @@ * limitations under the License. 
*/ #include "RelMetric.h" + #include #include #include +#include +#include using namespace rapidjson; +namespace ProfileEvents +{ +extern const Event FileSegmentWaitReadBufferMicroseconds; +extern const Event FileSegmentReadMicroseconds; +extern const Event FileSegmentCacheWriteMicroseconds; +extern const Event FileSegmentPredownloadMicroseconds; +extern const Event FileSegmentUsedBytes; + +extern const Event CachedReadBufferReadFromSourceMicroseconds; +extern const Event CachedReadBufferReadFromCacheMicroseconds; +extern const Event CachedReadBufferCacheWriteMicroseconds; +extern const Event CachedReadBufferReadFromSourceBytes; +extern const Event CachedReadBufferReadFromCacheBytes; +extern const Event CachedReadBufferCacheWriteBytes; +extern const Event CachedReadBufferCreateBufferMicroseconds; + +extern const Event CachedReadBufferReadFromCacheHits; +extern const Event CachedReadBufferReadFromCacheMisses; +} + namespace local_engine { +static void writeCacheHits(Writer & writer) +{ + const auto thread_group = QueryContextManager::currentThreadGroup(); + auto & counters = thread_group->performance_counters; + auto read_cache_hits = counters[ProfileEvents::CachedReadBufferReadFromCacheHits].load(); + auto miss_cache_hits = counters[ProfileEvents::CachedReadBufferReadFromCacheMisses].load(); + auto read_cache_bytes = counters[ProfileEvents::CachedReadBufferReadFromCacheBytes].load(); + auto read_miss_bytes = counters[ProfileEvents::CachedReadBufferReadFromSourceBytes].load(); + auto read_cache_millisecond = counters[ProfileEvents::CachedReadBufferReadFromCacheMicroseconds].load() / 1000; + auto miss_cache_millisecond = counters[ProfileEvents::CachedReadBufferReadFromSourceMicroseconds].load() / 1000; + + writer.Key("read_cache_hits"); + writer.Uint64(read_cache_hits); + writer.Key("miss_cache_hits"); + writer.Uint64(miss_cache_hits); + writer.Key("read_cache_bytes"); + writer.Uint64(read_cache_bytes); + writer.Key("read_miss_bytes"); + writer.Uint64(read_miss_bytes); + writer.Key("read_cache_millisecond"); + writer.Uint64(read_cache_millisecond); + writer.Key("miss_cache_millisecond"); + writer.Uint64(miss_cache_millisecond); +} + RelMetric::RelMetric(size_t id_, const String & name_, std::vector & steps_) : id(id_), name(name_), steps(steps_) { } @@ -117,7 +165,7 @@ void RelMetric::serialize(Writer & writer, bool) const } writer.EndArray(); - if (auto read_mergetree = dynamic_cast(step)) + if (auto read_mergetree = dynamic_cast(step)) { auto selected_marks_pk = read_mergetree->getAnalysisResult().selected_marks_pk; auto selected_marks = read_mergetree->getAnalysisResult().selected_marks; @@ -128,6 +176,11 @@ void RelMetric::serialize(Writer & writer, bool) const writer.Uint64(selected_marks); writer.Key("total_marks_pk"); writer.Uint64(total_marks_pk); + writeCacheHits(writer); + } + else if (dynamic_cast(step)) + { + writeCacheHits(writer); } writer.EndObject(); diff --git a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp index d0924a745716f..297551bcccc2b 100644 --- a/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp +++ b/cpp-ch/local-engine/Parser/SerializedPlanParser.cpp @@ -285,7 +285,10 @@ QueryPlanStepPtr SerializedPlanParser::parseReadRealWithLocalFile(const substrai if (rel.has_local_files()) local_files = rel.local_files(); else + { local_files = BinaryToMessage(split_infos.at(nextSplitInfoIndex())); + logDebugMessage(local_files, "local_files"); + } auto source = std::make_shared(context, header, local_files); auto 
source_pipe = Pipe(source); auto source_step = std::make_unique(context, std::move(source_pipe), "substrait local files"); @@ -496,7 +499,10 @@ QueryPlanPtr SerializedPlanParser::parseOp(const substrait::Rel & rel, std::list if (read.has_extension_table()) extension_table = read.extension_table(); else + { extension_table = BinaryToMessage(split_infos.at(nextSplitInfoIndex())); + logDebugMessage(extension_table, "extension_table"); + } MergeTreeRelParser mergeTreeParser(this, context); query_plan = mergeTreeParser.parseReadRel(std::make_unique(), read, extension_table); @@ -689,7 +695,7 @@ ActionsDAG::NodeRawConstPtrs SerializedPlanParser::parseArrayJoinWithDAG( /// pos = cast(arrayJoin(arg_not_null).1, "Int32") const auto * pos_node = add_tuple_element(array_join_node, 1); - pos_node = ActionsDAGUtil::convertNodeType(actions_dag, pos_node, "Int32"); + pos_node = ActionsDAGUtil::convertNodeType(actions_dag, pos_node, INT()); /// if is_map is false, output col = arrayJoin(arg_not_null).2 /// if is_map is true, output (key, value) = arrayJoin(arg_not_null).2 @@ -772,7 +778,7 @@ std::pair SerializedPlanParser::convertStructFieldType(const #define UINT_CONVERT(type_ptr, field, type_name) \ if ((type_ptr)->getTypeId() == TypeIndex::type_name) \ { \ - return {std::make_shared(), static_cast((field).get()) + 1}; \ + return {std::make_shared(), static_cast((field).safeGet()) + 1}; \ } auto type_id = type->getTypeId(); diff --git a/cpp-ch/local-engine/Parser/TypeParser.cpp b/cpp-ch/local-engine/Parser/TypeParser.cpp index 39d52131e4e58..269f35747552e 100644 --- a/cpp-ch/local-engine/Parser/TypeParser.cpp +++ b/cpp-ch/local-engine/Parser/TypeParser.cpp @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include #include #include #include diff --git a/cpp-ch/local-engine/Parser/WriteRelParser.cpp b/cpp-ch/local-engine/Parser/WriteRelParser.cpp index 9b6226adbed81..1a468a41eef27 100644 --- a/cpp-ch/local-engine/Parser/WriteRelParser.cpp +++ b/cpp-ch/local-engine/Parser/WriteRelParser.cpp @@ -137,12 +137,12 @@ void addSinkTransfrom(const DB::ContextPtr & context, const substrait::WriteRel DB::Field field_tmp_dir; if (!settings.tryGet(SPARK_TASK_WRITE_TMEP_DIR, field_tmp_dir)) throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Write Pipeline need inject temp directory."); - const auto & tmp_dir = field_tmp_dir.get(); + const auto & tmp_dir = field_tmp_dir.safeGet(); DB::Field field_filename; if (!settings.tryGet(SPARK_TASK_WRITE_FILENAME, field_filename)) throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Write Pipeline need inject file name."); - const auto & filename = field_filename.get(); + const auto & filename = field_filename.safeGet(); assert(write_rel.has_named_table()); const substrait::NamedObjectWrite & named_table = write_rel.named_table(); diff --git a/cpp-ch/local-engine/Parser/aggregate_function_parser/ApproxPercentileParser.cpp b/cpp-ch/local-engine/Parser/aggregate_function_parser/ApproxPercentileParser.cpp index 237da650c8e12..ceddbd2aef809 100644 --- a/cpp-ch/local-engine/Parser/aggregate_function_parser/ApproxPercentileParser.cpp +++ b/cpp-ch/local-engine/Parser/aggregate_function_parser/ApproxPercentileParser.cpp @@ -98,7 +98,7 @@ DB::Array ApproxPercentileParser::parseFunctionParameters( if (isArray(type2)) { /// Multiple percentages for quantilesGK - const Array & percentags = field2.get(); + const Array & percentags = field2.safeGet(); for (const auto & percentage : percentags) params.emplace_back(percentage); } diff --git a/cpp-ch/local-engine/Parser/aggregate_function_parser/BloomFilterAggParser.cpp b/cpp-ch/local-engine/Parser/aggregate_function_parser/BloomFilterAggParser.cpp index 8788abb6dcf79..10bf0b09482e0 100644 --- a/cpp-ch/local-engine/Parser/aggregate_function_parser/BloomFilterAggParser.cpp +++ b/cpp-ch/local-engine/Parser/aggregate_function_parser/BloomFilterAggParser.cpp @@ -63,8 +63,8 @@ DB::Array AggregateFunctionParserBloomFilterAgg::parseFunctionParameters( node->column->get(0, ret); return ret; }; - Int64 insert_num = get_parameter_field(arg_nodes[1], 1).get(); - Int64 bits_num = get_parameter_field(arg_nodes[2], 2).get(); + Int64 insert_num = get_parameter_field(arg_nodes[1], 1).safeGet(); + Int64 bits_num = get_parameter_field(arg_nodes[2], 2).safeGet(); // Delete all args except the first arg. 
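The Field::get() to safeGet() migration running through these parsers swaps unchecked access for type-checked access: safeGet<T>() verifies the type actually stored in the Field and throws on a mismatch instead of reinterpreting the value. A minimal sketch of the behaviour being relied on, assuming ClickHouse's Core/Field.h; it is illustrative and not taken from this patch.

#include <Core/Field.h>

DB::String safeGetExample()
{
    DB::Field f = DB::String("42");
    // f.safeGet<DB::Int64>() would throw here: the stored type is String, not Int64.
    return f.safeGet<DB::String>();   // type-checked access
}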
arg_nodes.resize(1); diff --git a/cpp-ch/local-engine/Parser/aggregate_function_parser/LeadLagParser.cpp b/cpp-ch/local-engine/Parser/aggregate_function_parser/LeadLagParser.cpp index 6d0075705c444..536aec1b60f44 100644 --- a/cpp-ch/local-engine/Parser/aggregate_function_parser/LeadLagParser.cpp +++ b/cpp-ch/local-engine/Parser/aggregate_function_parser/LeadLagParser.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include namespace local_engine @@ -41,7 +42,7 @@ LeadParser::parseFunctionArguments(const CommonFunctionInfo & func_info, DB::Act node = ActionsDAGUtil::convertNodeType( actions_dag, &actions_dag.findInOutputs(arg0_col_name), - DB::makeNullable(arg0_col_type)->getName(), + DB::makeNullable(arg0_col_type), arg0_col_name); actions_dag.addOrReplaceInOutputs(*node); args.push_back(node); @@ -52,7 +53,7 @@ LeadParser::parseFunctionArguments(const CommonFunctionInfo & func_info, DB::Act } node = parseExpression(actions_dag, arg1); - node = ActionsDAGUtil::convertNodeType(actions_dag, node, DB::DataTypeInt64().getName()); + node = ActionsDAGUtil::convertNodeType(actions_dag, node, BIGINT()); actions_dag.addOrReplaceInOutputs(*node); args.push_back(node); @@ -84,7 +85,7 @@ LagParser::parseFunctionArguments(const CommonFunctionInfo & func_info, DB::Acti node = ActionsDAGUtil::convertNodeType( actions_dag, &actions_dag.findInOutputs(arg0_col_name), - DB::makeNullable(arg0_col_type)->getName(), + makeNullable(arg0_col_type), arg0_col_name); actions_dag.addOrReplaceInOutputs(*node); args.push_back(node); @@ -100,7 +101,7 @@ LagParser::parseFunctionArguments(const CommonFunctionInfo & func_info, DB::Acti auto real_field = 0 - literal_result.second.safeGet(); node = &actions_dag.addColumn(ColumnWithTypeAndName( literal_result.first->createColumnConst(1, real_field), literal_result.first, getUniqueName(toString(real_field)))); - node = ActionsDAGUtil::convertNodeType(actions_dag, node, DB::DataTypeInt64().getName()); + node = ActionsDAGUtil::convertNodeType(actions_dag, node, BIGINT()); actions_dag.addOrReplaceInOutputs(*node); args.push_back(node); diff --git a/cpp-ch/local-engine/Parser/aggregate_function_parser/NtileParser.cpp b/cpp-ch/local-engine/Parser/aggregate_function_parser/NtileParser.cpp index 62f83223c06f7..1a24e320609e7 100644 --- a/cpp-ch/local-engine/Parser/aggregate_function_parser/NtileParser.cpp +++ b/cpp-ch/local-engine/Parser/aggregate_function_parser/NtileParser.cpp @@ -32,7 +32,7 @@ NtileParser::parseFunctionArguments(const CommonFunctionInfo & func_info, DB::Ac auto [data_type, field] = parseLiteral(arg0.literal()); if (!(DB::WhichDataType(data_type).isInt32())) throw Exception(ErrorCodes::BAD_ARGUMENTS, "ntile's argument must be i32"); - Int32 field_index = static_cast(field.get()); + Int32 field_index = static_cast(field.safeGet()); // For CH, the data type of the args[0] must be the UInt32 const auto * index_node = addColumnToActionsDAG(actions_dag, std::make_shared(), field_index); args.emplace_back(index_node); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp index 0eab34ec99d94..5b110dddbca16 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/CommonScalarFunctionParser.cpp @@ -61,6 +61,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(GetTimestamp, get_timestamp, parseDateTim REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Quarter, quarter, 
toQuarter); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(ToUnixTimestamp, to_unix_timestamp, parseDateTimeInJodaSyntaxOrNull); +// math functions REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Position, positive, identity); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Negative, negative, negate); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Pmod, pmod, pmod); @@ -107,6 +108,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Rand, rand, randCanonical); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Bin, bin, sparkBin); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Rint, rint, sparkRint); +// string functions REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Like, like, like); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(NotLike, not_like, notLike); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(StartsWith, starts_with, startsWithUTF8); @@ -131,6 +133,7 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Conv, conv, sparkConv); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Uuid, uuid, generateUUIDv4); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Levenshtein, levenshtein, editDistanceUTF8); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Concat, concat, concat); +REGISTER_COMMON_SCALAR_FUNCTION_PARSER(FormatString, format_string, printf); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Crc32, crc32, CRC32); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Murmur3Hash, murmur3hash, sparkMurmurHash3_32); @@ -151,7 +154,6 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(FloorDatetime, floor_datetime, dateTrunc) REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Floor, floor, sparkFloor); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(MothsBetween, months_between, sparkMonthsBetween); - // array functions REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Array, array, array); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(Shuffle, shuffle, arrayShuffle); @@ -166,7 +168,6 @@ REGISTER_COMMON_SCALAR_FUNCTION_PARSER(MapKeys, map_keys, mapKeys); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(MapValues, map_values, mapValues); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(MapFromArrays, map_from_arrays, mapFromArrays); - // json functions REGISTER_COMMON_SCALAR_FUNCTION_PARSER(FlattenJsonStringOnRequired, flattenJSONStringOnRequired, flattenJSONStringOnRequired); REGISTER_COMMON_SCALAR_FUNCTION_PARSER(ToJson, to_json, toJSONString); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/arrayHighOrderFunctions.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/arrayHighOrderFunctions.cpp index a475a1efb367d..aa82b33a7a3c8 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/arrayHighOrderFunctions.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/arrayHighOrderFunctions.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -60,7 +61,7 @@ class ArrayFilter : public FunctionParser /// filter with index argument. const auto * range_end_node = toFunctionNode(actions_dag, "length", {toFunctionNode(actions_dag, "assumeNotNull", {parsed_args[0]})}); range_end_node = ActionsDAGUtil::convertNodeType( - actions_dag, range_end_node, "Nullable(Int32)", range_end_node->result_name); + actions_dag, range_end_node, makeNullable(INT()), range_end_node->result_name); const auto * index_array_node = toFunctionNode( actions_dag, "range", @@ -106,7 +107,7 @@ class ArrayTransform : public FunctionParser /// transform with index argument. 
const auto * range_end_node = toFunctionNode(actions_dag, "length", {toFunctionNode(actions_dag, "assumeNotNull", {parsed_args[0]})}); range_end_node = ActionsDAGUtil::convertNodeType( - actions_dag, range_end_node, "Nullable(Int32)", range_end_node->result_name); + actions_dag, range_end_node, makeNullable(INT()), range_end_node->result_name); const auto * index_array_node = toFunctionNode( actions_dag, "range", @@ -141,7 +142,7 @@ class ArrayAggregate : public FunctionParser parsed_args[1] = ActionsDAGUtil::convertNodeType( actions_dag, parsed_args[1], - function_type->getReturnType()->getName(), + function_type->getReturnType(), parsed_args[1]->result_name); } @@ -215,14 +216,14 @@ class ArraySort : public FunctionParser if (!var_expr.has_literal()) return false; auto [_, name] = plan_parser->parseLiteral(var_expr.literal()); - return var == name.get(); + return var == name.safeGet(); }; auto is_int_value = [&](const substrait::Expression & expr, Int32 val) { if (!expr.has_literal()) return false; auto [_, x] = plan_parser->parseLiteral(expr.literal()); - return val == x.get(); + return val == x.safeGet(); }; auto is_variable_null = [&](const substrait::Expression & expr, const String & var) { diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/arrayPosition.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/arrayPosition.cpp index 1fda3d8fa7536..b0ade35a35902 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/arrayPosition.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/arrayPosition.cpp @@ -86,7 +86,7 @@ class FunctionParserArrayPosition : public FunctionParser DataTypePtr wrap_arr_nullable_type = wrapNullableType(true, ch_function_node->result_type); const auto * wrap_index_of_node = ActionsDAGUtil::convertNodeType( - actions_dag, ch_function_node, wrap_arr_nullable_type->getName(), ch_function_node->result_name); + actions_dag, ch_function_node, wrap_arr_nullable_type, ch_function_node->result_name); const auto * null_const_node = addColumnToActionsDAG(actions_dag, wrap_arr_nullable_type, Field{}); const auto * or_condition_node = toFunctionNode(actions_dag, "or", {arr_is_null_node, val_is_null_node}); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/elt.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/elt.cpp index 992235cd9a0bb..accc6d418b9f9 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/elt.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/elt.cpp @@ -74,7 +74,7 @@ class FunctionParserElt : public FunctionParser auto nullable_result_type = makeNullable(result_type); const auto * nullable_array_element_node = ActionsDAGUtil::convertNodeType( - actions_dag, array_element_node, nullable_result_type->getName(), array_element_node->result_name); + actions_dag, array_element_node, nullable_result_type, array_element_node->result_name); const auto * null_const_node = addColumnToActionsDAG(actions_dag, nullable_result_type, Field()); const auto * is_null_node = toFunctionNode(actions_dag, "isNull", {index_arg}); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/findInset.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/findInset.cpp index ca9fb372c2fde..96fedc6fe6467 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/findInset.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/findInset.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include namespace DB @@ -73,9 +74,9 @@ class FunctionParserFindInSet : public FunctionParser if (!str_is_nullable && 
!str_array_is_nullable) return convertNodeTypeIfNeeded(substrait_func, index_of_node, actions_dag); - auto nullable_result_type = makeNullable(std::make_shared()); + auto nullable_result_type = makeNullable(INT()); const auto * nullable_index_of_node = ActionsDAGUtil::convertNodeType( - actions_dag, index_of_node, nullable_result_type->getName(), index_of_node->result_name); + actions_dag, index_of_node, nullable_result_type, index_of_node->result_name); const auto * null_const_node = addColumnToActionsDAG(actions_dag, nullable_result_type, Field()); const auto * str_is_null_node = toFunctionNode(actions_dag, "isNull", {str_arg}); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.cpp index 547ffd971fcd4..c2841564e8c3f 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/lambdaFunction.cpp @@ -43,7 +43,7 @@ DB::NamesAndTypesList collectLambdaArguments(const SerializedPlanParser & plan_p && plan_parser_.getFunctionSignatureName(arg.value().scalar_function().function_reference()) == "namedlambdavariable") { auto [_, col_name_field] = plan_parser_.parseLiteral(arg.value().scalar_function().arguments()[0].value().literal()); - String col_name = col_name_field.get(); + String col_name = col_name_field.safeGet(); if (collected_names.contains(col_name)) { continue; @@ -187,7 +187,7 @@ class NamedLambdaVariable : public FunctionParser const DB::ActionsDAG::Node * parse(const substrait::Expression_ScalarFunction & substrait_func, DB::ActionsDAG & actions_dag) const override { auto [_, col_name_field] = parseLiteral(substrait_func.arguments()[0].value().literal()); - String col_name = col_name_field.get(); + String col_name = col_name_field.safeGet(); auto type = TypeParser::parseType(substrait_func.output_type()); const auto & inputs = actions_dag.getInputs(); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/locate.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/locate.cpp index b948daeda0eaa..17115895eaff0 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/locate.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/locate.cpp @@ -17,6 +17,7 @@ #include #include +#include #include namespace DB @@ -50,7 +51,7 @@ class FunctionParserLocate : public FunctionParser const auto * substr_arg = parsed_args[0]; const auto * str_arg = parsed_args[1]; - const auto * start_pos_arg = ActionsDAGUtil::convertNodeType(actions_dag, parsed_args[2], "Nullable(UInt32)"); + const auto * start_pos_arg = ActionsDAGUtil::convertNodeType(actions_dag, parsed_args[2], makeNullable(UINT())); const auto * is_start_pos_null_node = toFunctionNode(actions_dag, "isNull", {start_pos_arg}); const auto * const_1_node = addColumnToActionsDAG(actions_dag, std::make_shared(), 0); const auto * position_node = toFunctionNode(actions_dag, "positionUTF8Spark", {str_arg, substr_arg, start_pos_arg}); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/repeat.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/repeat.cpp index ada91f8537fe4..74254911a0a0a 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/repeat.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/repeat.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -42,8 +43,7 @@ class SparkFunctionRepeatParser : public FunctionParser const auto & args = substrait_func.arguments(); 
parsed_args.emplace_back(parseExpression(actions_dag, args[0].value())); const auto * repeat_times_node = parseExpression(actions_dag, args[1].value()); - DB::DataTypeNullable target_type(std::make_shared()); - repeat_times_node = ActionsDAGUtil::convertNodeType(actions_dag, repeat_times_node, target_type.getName()); + repeat_times_node = ActionsDAGUtil::convertNodeType(actions_dag, repeat_times_node, makeNullable(UINT())); parsed_args.emplace_back(repeat_times_node); const auto * func_node = toFunctionNode(actions_dag, ch_function_name, parsed_args); return convertNodeTypeIfNeeded(substrait_func, func_node, actions_dag); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/slice.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/slice.cpp index 2643207354ae7..a96dca8efe4dd 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/slice.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/slice.cpp @@ -89,7 +89,7 @@ class FunctionParserArraySlice : public FunctionParser DataTypePtr wrap_arr_nullable_type = wrapNullableType(true, slice_node->result_type); const auto * wrap_slice_node = ActionsDAGUtil::convertNodeType( - actions_dag, slice_node, wrap_arr_nullable_type->getName(), slice_node->result_name); + actions_dag, slice_node, wrap_arr_nullable_type, slice_node->result_name); const auto * null_const_node = addColumnToActionsDAG(actions_dag, wrap_arr_nullable_type, Field{}); const auto * arr_is_null_node = toFunctionNode(actions_dag, "isNull", {arr_arg}); diff --git a/cpp-ch/local-engine/Parser/scalar_function_parser/tupleElement.cpp b/cpp-ch/local-engine/Parser/scalar_function_parser/tupleElement.cpp index 179aa7860484d..4809cc887b8df 100644 --- a/cpp-ch/local-engine/Parser/scalar_function_parser/tupleElement.cpp +++ b/cpp-ch/local-engine/Parser/scalar_function_parser/tupleElement.cpp @@ -45,7 +45,7 @@ namespace local_engine auto [data_type, field] = parseLiteral(args[1].value().literal()); \ if (!DB::WhichDataType(data_type).isInt32()) \ throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "{}'s second argument must be i32", #substrait_name); \ - Int32 field_index = static_cast(field.get() + 1); \ + Int32 field_index = static_cast(field.safeGet() + 1); \ const auto * index_node = addColumnToActionsDAG(actions_dag, std::make_shared(), field_index); \ parsed_args.emplace_back(index_node); \ const auto * func_node = toFunctionNode(actions_dag, ch_function_name, parsed_args); \ diff --git a/cpp-ch/local-engine/Shuffle/PartitionWriter.cpp b/cpp-ch/local-engine/Shuffle/PartitionWriter.cpp index a2ef0888aeff5..79d640d3b2bca 100644 --- a/cpp-ch/local-engine/Shuffle/PartitionWriter.cpp +++ b/cpp-ch/local-engine/Shuffle/PartitionWriter.cpp @@ -139,7 +139,7 @@ size_t LocalPartitionWriter::evictPartitions() { auto file = getNextSpillFile(); WriteBufferFromFile output(file, shuffle_writer->options.io_buffer_size); - auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), {}); + auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), shuffle_writer->options.compress_level); CompressedWriteBuffer compressed_output(output, codec, shuffle_writer->options.io_buffer_size); NativeWriter writer(compressed_output, shuffle_writer->output_header); @@ -200,7 +200,7 @@ String Spillable::getNextSpillFile() std::vector Spillable::mergeSpills(CachedShuffleWriter * shuffle_writer, WriteBuffer & data_file, ExtraData extra_data) { - auto codec = 
DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), {}); + auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), shuffle_writer->options.compress_level); CompressedWriteBuffer compressed_output(data_file, codec, shuffle_writer->options.io_buffer_size); NativeWriter writer(compressed_output, shuffle_writer->output_header); @@ -324,7 +324,7 @@ PartitionWriter::PartitionWriter(CachedShuffleWriter * shuffle_writer_, LoggerPt partition_block_buffer[partition_id] = std::make_shared(options->split_size); partition_buffer[partition_id] = std::make_shared(); } - settings.loadFromContext(SerializedPlanParser::global_context); + settings = MemoryConfig::loadFromContext(SerializedPlanParser::global_context); } size_t PartitionWriter::bytes() const @@ -352,7 +352,7 @@ size_t MemorySortLocalPartitionWriter::evictPartitions() return; auto file = getNextSpillFile(); WriteBufferFromFile output(file, shuffle_writer->options.io_buffer_size); - auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), {}); + auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), shuffle_writer->options.compress_level); CompressedWriteBuffer compressed_output(output, codec, shuffle_writer->options.io_buffer_size); NativeWriter writer(compressed_output, output_header); @@ -453,7 +453,7 @@ size_t MemorySortCelebornPartitionWriter::evictPartitions() return; WriteBufferFromOwnString output; - auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), {}); + auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), shuffle_writer->options.compress_level); CompressedWriteBuffer compressed_output(output, codec, shuffle_writer->options.io_buffer_size); NativeWriter writer(compressed_output, shuffle_writer->output_header); @@ -469,6 +469,7 @@ size_t MemorySortCelebornPartitionWriter::evictPartitions() celeborn_client->pushPartitionData(cur_partition_id, data.data(), data.size()); shuffle_writer->split_result.total_io_time += push_time_watch.elapsedNanoseconds(); shuffle_writer->split_result.partition_lengths[cur_partition_id] += data.size(); + shuffle_writer->split_result.total_bytes_written += data.size(); } output.restart(); }; @@ -564,7 +565,7 @@ size_t CelebornPartitionWriter::evictSinglePartition(size_t partition_id) return; WriteBufferFromOwnString output; - auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), {}); + auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(shuffle_writer->options.compress_method), shuffle_writer->options.compress_level); CompressedWriteBuffer compressed_output(output, codec, shuffle_writer->options.io_buffer_size); NativeWriter writer(compressed_output, shuffle_writer->output_header); @@ -586,6 +587,7 @@ size_t CelebornPartitionWriter::evictSinglePartition(size_t partition_id) shuffle_writer->split_result.total_write_time += push_time_watch.elapsedNanoseconds(); shuffle_writer->split_result.total_io_time += push_time_watch.elapsedNanoseconds(); shuffle_writer->split_result.total_serialize_time += serialization_time_watch.elapsedNanoseconds(); + shuffle_writer->split_result.total_bytes_written += written_bytes; }; Stopwatch spill_time_watch; diff 
--git a/cpp-ch/local-engine/Shuffle/SelectorBuilder.cpp b/cpp-ch/local-engine/Shuffle/SelectorBuilder.cpp index b9bd02c3ef687..272a6f2f6bee6 100644 --- a/cpp-ch/local-engine/Shuffle/SelectorBuilder.cpp +++ b/cpp-ch/local-engine/Shuffle/SelectorBuilder.cpp @@ -367,19 +367,31 @@ void RangeSelectorBuilder::computePartitionIdByBinarySearch(DB::Block & block, D selector.emplace_back(selected_partition); } } +namespace { +int doCompareAt(const ColumnPtr & lhs, size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) +{ + if (const auto * l_const = typeid_cast(lhs.get())) + { + // we know rhs never be Const + chassert(l_const->getDataType() == rhs.getDataType()); + return l_const->getDataColumn().compareAt(0, m, rhs, nan_direction_hint); + } + return lhs->compareAt(n, m, rhs, nan_direction_hint); +} +} int RangeSelectorBuilder::compareRow( const DB::Columns & columns, const std::vector & required_columns, size_t row, const DB::Columns & bound_columns, - size_t bound_row) + size_t bound_row) const { for (size_t i = 0, n = required_columns.size(); i < n; ++i) { auto lpos = required_columns[i]; auto rpos = i; - auto res = columns[lpos]->compareAt(row, bound_row, *bound_columns[rpos], sort_descriptions[i].nulls_direction) + auto res = doCompareAt(columns[lpos], row, bound_row, *bound_columns[rpos], sort_descriptions[i].nulls_direction) * sort_descriptions[i].direction; if (res != 0) return res; diff --git a/cpp-ch/local-engine/Shuffle/SelectorBuilder.h b/cpp-ch/local-engine/Shuffle/SelectorBuilder.h index 97894daa3c14d..7349849f538e5 100644 --- a/cpp-ch/local-engine/Shuffle/SelectorBuilder.h +++ b/cpp-ch/local-engine/Shuffle/SelectorBuilder.h @@ -118,7 +118,7 @@ class RangeSelectorBuilder : public SelectorBuilder const std::vector & required_columns, size_t row, const DB::Columns & bound_columns, - size_t bound_row); + size_t bound_row) const; int binarySearchBound( const DB::Columns & bound_columns, diff --git a/cpp-ch/local-engine/Shuffle/ShuffleCommon.h b/cpp-ch/local-engine/Shuffle/ShuffleCommon.h index d398362aa4b64..052f6d2e37e99 100644 --- a/cpp-ch/local-engine/Shuffle/ShuffleCommon.h +++ b/cpp-ch/local-engine/Shuffle/ShuffleCommon.h @@ -44,7 +44,7 @@ struct SplitOptions std::string hash_exprs; std::string out_exprs; std::string compress_method = "zstd"; - int compress_level; + std::optional compress_level; size_t spill_threshold = 300 * 1024 * 1024; std::string hash_algorithm; size_t max_sort_buffer_size = 1_GiB; diff --git a/cpp-ch/local-engine/Shuffle/ShuffleWriter.cpp b/cpp-ch/local-engine/Shuffle/ShuffleWriter.cpp index dddf0b895fdff..8aa624ff99797 100644 --- a/cpp-ch/local-engine/Shuffle/ShuffleWriter.cpp +++ b/cpp-ch/local-engine/Shuffle/ShuffleWriter.cpp @@ -25,13 +25,13 @@ using namespace DB; namespace local_engine { ShuffleWriter::ShuffleWriter( - jobject output_stream, jbyteArray buffer, const std::string & codecStr, bool enable_compression, size_t customize_buffer_size) + jobject output_stream, jbyteArray buffer, const std::string & codecStr, jint level, bool enable_compression, size_t customize_buffer_size) { compression_enable = enable_compression; write_buffer = std::make_unique(output_stream, buffer, customize_buffer_size); if (compression_enable) { - auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(codecStr), {}); + auto codec = DB::CompressionCodecFactory::instance().get(boost::to_upper_copy(codecStr), level < 0 ? 
std::nullopt : std::optional(level)); compressed_out = std::make_unique(*write_buffer, codec); } } diff --git a/cpp-ch/local-engine/Shuffle/ShuffleWriter.h b/cpp-ch/local-engine/Shuffle/ShuffleWriter.h index 98f67d1ccadb8..541e93e0347c4 100644 --- a/cpp-ch/local-engine/Shuffle/ShuffleWriter.h +++ b/cpp-ch/local-engine/Shuffle/ShuffleWriter.h @@ -24,7 +24,7 @@ class ShuffleWriter { public: ShuffleWriter( - jobject output_stream, jbyteArray buffer, const std::string & codecStr, bool enable_compression, size_t customize_buffer_size); + jobject output_stream, jbyteArray buffer, const std::string & codecStr, jint level, bool enable_compression, size_t customize_buffer_size); virtual ~ShuffleWriter(); void write(const DB::Block & block); void flush(); diff --git a/cpp-ch/local-engine/Storages/Cache/CacheManager.cpp b/cpp-ch/local-engine/Storages/Cache/CacheManager.cpp index d2c7b06810db6..0dc852a901105 100644 --- a/cpp-ch/local-engine/Storages/Cache/CacheManager.cpp +++ b/cpp-ch/local-engine/Storages/Cache/CacheManager.cpp @@ -16,21 +16,23 @@ */ #include "CacheManager.h" +#include #include #include #include -#include +#include #include -#include -#include +#include #include #include #include -#include #include +#include #include +#include #include -#include + +#include namespace DB { @@ -49,6 +51,16 @@ extern const Metric LocalThreadScheduled; namespace local_engine { + +jclass CacheManager::cache_result_class = nullptr; +jmethodID CacheManager::cache_result_constructor = nullptr; + +void CacheManager::initJNI(JNIEnv * env) +{ + cache_result_class = CreateGlobalClassReference(env, "Lorg/apache/gluten/execution/CacheResult;"); + cache_result_constructor = GetMethodID(env, cache_result_class, "", "(ILjava/lang/String;)V"); +} + CacheManager & CacheManager::instance() { static CacheManager cache_manager; @@ -59,13 +71,6 @@ void CacheManager::initialize(DB::ContextMutablePtr context_) { auto & manager = instance(); manager.context = context_; - manager.thread_pool = std::make_unique( - CurrentMetrics::LocalThread, - CurrentMetrics::LocalThreadActive, - CurrentMetrics::LocalThreadScheduled, - manager.context->getConfigRef().getInt("cache_sync_max_threads", 10), - 0, - 0); } struct CacheJobContext @@ -73,17 +78,16 @@ struct CacheJobContext MergeTreeTable table; }; -void CacheManager::cachePart(const MergeTreeTable& table, const MergeTreePart& part, const std::unordered_set & columns, std::shared_ptr latch) +Task CacheManager::cachePart(const MergeTreeTable& table, const MergeTreePart& part, const std::unordered_set & columns) { CacheJobContext job_context{table}; job_context.table.parts.clear(); job_context.table.parts.push_back(part); job_context.table.snapshot_id = ""; - auto job = [job_detail = job_context, context = this->context, read_columns = columns, latch = latch]() + Task task = [job_detail = job_context, context = this->context, read_columns = columns]() { try { - SCOPE_EXIT({ if (latch) latch->count_down();}); auto storage = MergeTreeRelParser::parseStorage(job_detail.table, context, true); auto storage_snapshot = std::make_shared(*storage, storage->getInMemoryMetadataPtr()); NamesAndTypesList names_and_types_list; @@ -113,8 +117,7 @@ void CacheManager::cachePart(const MergeTreeTable& table, const MergeTreePart& p PullingPipelineExecutor executor(pipeline); while (true) { - Chunk chunk; - if (!executor.pull(chunk)) + if (Chunk chunk; !executor.pull(chunk)) break; } LOG_INFO(getLogger("CacheManager"), "Load cache of table {}.{} part {} success.", job_detail.table.database, 
job_detail.table.table, job_detail.table.parts.front().name); @@ -122,22 +125,116 @@ void CacheManager::cachePart(const MergeTreeTable& table, const MergeTreePart& p catch (std::exception& e) { LOG_ERROR(getLogger("CacheManager"), "Load cache of table {}.{} part {} failed.\n {}", job_detail.table.database, job_detail.table.table, job_detail.table.parts.front().name, e.what()); + std::rethrow_exception(std::current_exception()); } }; LOG_INFO(getLogger("CacheManager"), "Loading cache of table {}.{} part {}", job_context.table.database, job_context.table.table, job_context.table.parts.front().name); - thread_pool->scheduleOrThrowOnError(std::move(job)); + return std::move(task); } -void CacheManager::cacheParts(const String& table_def, const std::unordered_set& columns, bool async) +JobId CacheManager::cacheParts(const String& table_def, const std::unordered_set& columns) { auto table = parseMergeTreeTableString(table_def); - std::shared_ptr latch = nullptr; - if (!async) latch = std::make_shared(table.parts.size()); + JobId id = toString(UUIDHelpers::generateV4()); + Job job(id); for (const auto & part : table.parts) { - cachePart(table, part, columns, latch); + job.addTask(cachePart(table, part, columns)); + } + auto& scheduler = JobScheduler::instance(); + scheduler.scheduleJob(std::move(job)); + return id; +} + +jobject CacheManager::getCacheStatus(JNIEnv * env, const String & jobId) +{ + auto& scheduler = JobScheduler::instance(); + auto job_status = scheduler.getJobSatus(jobId); + int status = 0; + String message; + if (job_status.has_value()) + { + switch (job_status.value().status) + { + case JobSatus::RUNNING: + status = 0; + break; + case JobSatus::FINISHED: + status = 1; + break; + case JobSatus::FAILED: + status = 2; + for (const auto & msg : job_status->messages) + { + message.append(msg); + message.append(";"); + } + break; + } + } + else + { + status = 2; + message = fmt::format("job {} not found", jobId); + } + return env->NewObject(cache_result_class, cache_result_constructor, status, charTojstring(env, message.c_str())); +} + +Task CacheManager::cacheFile(const substrait::ReadRel::LocalFiles::FileOrFiles & file, ReadBufferBuilderPtr read_buffer_builder) +{ + auto task = [file, read_buffer_builder, context = this->context]() + { + LOG_INFO(getLogger("CacheManager"), "Loading cache file {}", file.uri_file()); + + try + { + std::unique_ptr rb = read_buffer_builder->build(file); + while (!rb->eof()) + rb->ignoreAll(); + } + catch (std::exception & e) + { + LOG_ERROR(getLogger("CacheManager"), "Load cache file {} failed.\n {}", file.uri_file(), e.what()); + std::rethrow_exception(std::current_exception()); + } + }; + + return std::move(task); +} + +JobId CacheManager::cacheFiles(substrait::ReadRel::LocalFiles file_infos) +{ + JobId id = toString(UUIDHelpers::generateV4()); + Job job(id); + + if (file_infos.items_size()) + { + const Poco::URI file_uri(file_infos.items().Get(0).uri_file()); + const auto read_buffer_builder = ReadBufferBuilderFactory::instance().createBuilder(file_uri.getScheme(), context); + + if (read_buffer_builder->file_cache) + for (const auto & file : file_infos.items()) + job.addTask(cacheFile(file, read_buffer_builder)); + else + LOG_WARNING(getLogger("CacheManager"), "Load cache skipped because cache not enabled."); + } + + auto & scheduler = JobScheduler::instance(); + scheduler.scheduleJob(std::move(job)); + return id; +} + +void CacheManager::removeFiles(String file, String cache_name) +{ + // only for ut + for (const auto & [name, file_cache] : 
FileCacheFactory::instance().getAll()) + { + if (name != cache_name) + continue; + + if (const auto cache = file_cache->cache) + cache->removePathIfExists(file, DB::FileCache::getCommonUser().user_id); } - if (latch) - latch->wait(); } + } \ No newline at end of file diff --git a/cpp-ch/local-engine/Storages/Cache/CacheManager.h b/cpp-ch/local-engine/Storages/Cache/CacheManager.h index a303b7b7fc63e..6335f86bb162d 100644 --- a/cpp-ch/local-engine/Storages/Cache/CacheManager.h +++ b/cpp-ch/local-engine/Storages/Cache/CacheManager.h @@ -15,30 +15,41 @@ * limitations under the License. */ #pragma once -#include -#include +#include + +#include +#include +#include +#include namespace local_engine { struct MergeTreePart; struct MergeTreeTable; + + + /*** * Manage the cache of the MergeTree, mainly including meta.bin, data.bin, metadata.gluten */ class CacheManager { public: + static jclass cache_result_class; + static jmethodID cache_result_constructor; + static void initJNI(JNIEnv* env); + static CacheManager & instance(); static void initialize(DB::ContextMutablePtr context); - void cachePart(const MergeTreeTable& table, const MergeTreePart& part, const std::unordered_set& columns, std::shared_ptr latch = nullptr); - void cacheParts(const String& table_def, const std::unordered_set& columns, bool async = true); + Task cachePart(const MergeTreeTable& table, const MergeTreePart& part, const std::unordered_set& columns); + JobId cacheParts(const String& table_def, const std::unordered_set& columns); + static jobject getCacheStatus(JNIEnv * env, const String& jobId); + + Task cacheFile(const substrait::ReadRel::LocalFiles::FileOrFiles & file, ReadBufferBuilderPtr read_buffer_builder); + JobId cacheFiles(substrait::ReadRel::LocalFiles file_infos); + static void removeFiles(String file, String cache_name); private: CacheManager() = default; - - std::unique_ptr thread_pool; DB::ContextMutablePtr context; - std::unordered_map policy_to_disk; - std::unordered_map disk_to_metadisk; - std::unordered_map policy_to_cache; }; } \ No newline at end of file diff --git a/cpp-ch/local-engine/Storages/Cache/JobScheduler.cpp b/cpp-ch/local-engine/Storages/Cache/JobScheduler.cpp new file mode 100644 index 0000000000000..6a43ad644433c --- /dev/null +++ b/cpp-ch/local-engine/Storages/Cache/JobScheduler.cpp @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + + +#include "JobScheduler.h" + +#include +#include +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ +extern const int BAD_ARGUMENTS; +} +} + +namespace CurrentMetrics +{ +extern const Metric LocalThread; +extern const Metric LocalThreadActive; +extern const Metric LocalThreadScheduled; +} + +namespace local_engine +{ +std::shared_ptr global_job_scheduler = nullptr; + +void JobScheduler::initialize(DB::ContextPtr context) +{ + auto config = GlutenJobSchedulerConfig::loadFromContext(context); + instance().thread_pool = std::make_unique( + CurrentMetrics::LocalThread, + CurrentMetrics::LocalThreadActive, + CurrentMetrics::LocalThreadScheduled, + config.job_scheduler_max_threads, + 0, + 0); + +} + +JobId JobScheduler::scheduleJob(Job&& job) +{ + cleanFinishedJobs(); + if (job_details.contains(job.id)) + { + throw DB::Exception(DB::ErrorCodes::BAD_ARGUMENTS, "job {} exists.", job.id); + } + size_t task_num = job.tasks.size(); + auto job_id = job.id; + std::vector task_results; + task_results.reserve(task_num); + JobContext job_context = {std::move(job), std::make_unique(task_num), std::move(task_results)}; + { + std::lock_guard lock(job_details_mutex); + job_details.emplace(job_id, std::move(job_context)); + } + LOG_INFO(logger, "schedule job {}", job_id); + + auto & job_detail = job_details.at(job_id); + + for (auto & task : job_detail.job.tasks) + { + job_detail.task_results.emplace_back(TaskResult()); + auto & task_result = job_detail.task_results.back(); + thread_pool->scheduleOrThrow( + [&]() + { + SCOPE_EXIT({ + job_detail.remain_tasks->fetch_sub(1, std::memory_order::acquire); + if (job_detail.isFinished()) + { + addFinishedJob(job_detail.job.id); + } + }); + try + { + task(); + task_result.status = TaskResult::Status::SUCCESS; + } + catch (std::exception & e) + { + task_result.status = TaskResult::Status::FAILED; + task_result.message = e.what(); + } + }); + } + return job_id; +} + +std::optional JobScheduler::getJobSatus(const JobId & job_id) +{ + if (!job_details.contains(job_id)) + { + return std::nullopt; + } + std::optional res; + auto & job_context = job_details.at(job_id); + if (job_context.isFinished()) + { + std::vector messages; + for (auto & task_result : job_context.task_results) + { + if (task_result.status == TaskResult::Status::FAILED) + { + messages.push_back(task_result.message); + } + } + if (messages.empty()) + res = JobSatus::success(); + else + res= JobSatus::failed(messages); + } + else + res = JobSatus::running(); + return res; +} + +void JobScheduler::cleanupJob(const JobId & job_id) +{ + LOG_INFO(logger, "clean job {}", job_id); + job_details.erase(job_id); +} + +void JobScheduler::addFinishedJob(const JobId & job_id) +{ + std::lock_guard lock(finished_job_mutex); + auto job = std::make_pair(job_id, Stopwatch()); + finished_job.emplace_back(job); +} + +void JobScheduler::cleanFinishedJobs() +{ + std::lock_guard lock(finished_job_mutex); + for (auto it = finished_job.begin(); it != finished_job.end();) + { + // clean finished job after 5 minutes + if (it->second.elapsedSeconds() > 60 * 5) + { + cleanupJob(it->first); + it = finished_job.erase(it); + } + else + ++it; + } +} +} \ No newline at end of file diff --git a/cpp-ch/local-engine/Storages/Cache/JobScheduler.h b/cpp-ch/local-engine/Storages/Cache/JobScheduler.h new file mode 100644 index 0000000000000..b5c2f601a92b9 --- /dev/null +++ b/cpp-ch/local-engine/Storages/Cache/JobScheduler.h @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once +#include +#include +#include +#include + +namespace local_engine +{ + +using JobId = String; +using Task = std::function; + +class Job +{ + friend class JobScheduler; +public: + explicit Job(const JobId& id) + : id(id) + { + } + + void addTask(Task&& task) + { + tasks.emplace_back(task); + } + +private: + JobId id; + std::vector tasks; +}; + + + +struct JobSatus +{ + enum Status + { + RUNNING, + FINISHED, + FAILED + }; + Status status; + std::vector messages; + + static JobSatus success() + { + return JobSatus{FINISHED}; + } + + static JobSatus running() + { + return JobSatus{RUNNING}; + } + + static JobSatus failed(const std::vector & messages) + { + return JobSatus{FAILED, messages}; + } +}; + +struct TaskResult +{ + enum Status + { + SUCCESS, + FAILED, + RUNNING + }; + Status status = RUNNING; + String message; +}; + +class JobContext +{ +public: + Job job; + std::unique_ptr remain_tasks = std::make_unique(); + std::vector task_results; + + bool isFinished() + { + return remain_tasks->load(std::memory_order::relaxed) == 0; + } +}; + +class JobScheduler +{ +public: + static JobScheduler& instance() + { + static JobScheduler global_job_scheduler; + return global_job_scheduler; + } + + static void initialize(DB::ContextPtr context); + + JobId scheduleJob(Job&& job); + + std::optional getJobSatus(const JobId& job_id); + + void cleanupJob(const JobId& job_id); + + void addFinishedJob(const JobId& job_id); + + void cleanFinishedJobs(); +private: + JobScheduler() = default; + std::unique_ptr thread_pool; + std::unordered_map job_details; + std::mutex job_details_mutex; + + std::vector> finished_job; + std::mutex finished_job_mutex; + LoggerPtr logger = getLogger("JobScheduler"); +}; +} diff --git a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp index 961f482c7cae9..7336d7db5190d 100644 --- a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp +++ b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.cpp @@ -262,16 +262,16 @@ MergeTreeData::LoadPartResult CustomStorageMergeTree::loadDataPart( return res; } -void CustomStorageMergeTree::removePartFromMemory(const MergeTreeData::DataPartPtr & part_to_detach) +void CustomStorageMergeTree::removePartFromMemory(const MergeTreeData::DataPart & part_to_detach) { auto lock = lockParts(); bool removed_active_part = false; bool restored_active_part = false; - auto it_part = data_parts_by_info.find(part_to_detach->info); + auto it_part = data_parts_by_info.find(part_to_detach.info); if (it_part == data_parts_by_info.end()) { - LOG_DEBUG(log, "No such data part {}", part_to_detach->getNameWithState()); + LOG_DEBUG(log, "No such data part {}", part_to_detach.getNameWithState()); return; } diff --git a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.h 
b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.h index 9144aba429c0a..773e5858c24ff 100644 --- a/cpp-ch/local-engine/Storages/CustomStorageMergeTree.h +++ b/cpp-ch/local-engine/Storages/CustomStorageMergeTree.h @@ -54,7 +54,7 @@ class CustomStorageMergeTree final : public MergeTreeData bool scheduleDataProcessingJob(BackgroundJobsAssignee & executor) override; std::map getUnfinishedMutationCommands() const override; std::vector loadDataPartsWithNames(std::unordered_set parts); - void removePartFromMemory(const MergeTreeData::DataPartPtr & part_to_detach); + void removePartFromMemory(const MergeTreeData::DataPart & part_to_detach); MergeTreeDataWriter writer; MergeTreeDataSelectExecutor reader; diff --git a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp index 6fee65efe5931..93f4374d4ce1c 100644 --- a/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp +++ b/cpp-ch/local-engine/Storages/Mergetree/SparkMergeTreeWriter.cpp @@ -71,16 +71,16 @@ SparkMergeTreeWriter::SparkMergeTreeWriter( , thread_pool(CurrentMetrics::LocalThread, CurrentMetrics::LocalThreadActive, CurrentMetrics::LocalThreadScheduled, 1, 1, 100000) { const DB::Settings & settings = context->getSettingsRef(); - merge_after_insert = settings.get(MERGETREE_MERGE_AFTER_INSERT).get(); - insert_without_local_storage = settings.get(MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE).get(); + merge_after_insert = settings.get(MERGETREE_MERGE_AFTER_INSERT).safeGet(); + insert_without_local_storage = settings.get(MERGETREE_INSERT_WITHOUT_LOCAL_STORAGE).safeGet(); Field limit_size_field; if (settings.tryGet("optimize.minFileSize", limit_size_field)) - merge_min_size = limit_size_field.get() <= 0 ? merge_min_size : limit_size_field.get(); + merge_min_size = limit_size_field.safeGet() <= 0 ? merge_min_size : limit_size_field.safeGet(); Field limit_cnt_field; if (settings.tryGet("mergetree.max_num_part_per_merge_task", limit_cnt_field)) - merge_limit_parts = limit_cnt_field.get() <= 0 ? merge_limit_parts : limit_cnt_field.get(); + merge_limit_parts = limit_cnt_field.safeGet() <= 0 ? 
merge_limit_parts : limit_cnt_field.safeGet(); dest_storage = MergeTreeRelParser::parseStorage(merge_tree_table, SerializedPlanParser::global_context); isRemoteStorage = dest_storage->getStoragePolicy()->getAnyDisk()->isRemote(); diff --git a/cpp-ch/local-engine/Storages/Parquet/ParquetConverter.h b/cpp-ch/local-engine/Storages/Parquet/ParquetConverter.h index 89e83e668aebc..0ac16c11104d8 100644 --- a/cpp-ch/local-engine/Storages/Parquet/ParquetConverter.h +++ b/cpp-ch/local-engine/Storages/Parquet/ParquetConverter.h @@ -35,12 +35,23 @@ template struct ToParquet { using T = typename PhysicalType::c_type; - T as(const DB::Field & value, const parquet::ColumnDescriptor &) + T as(const DB::Field & value, const parquet::ColumnDescriptor & s) { - if constexpr (std::is_same_v) - return static_cast(value.get()); + if (s.logical_type()->is_decimal()) + { + if constexpr (std::is_same_v) + { + const auto v = value.safeGet>(); + return v.getValue().value; + } + if constexpr (std::is_same_v) + { + const auto v = value.safeGet>(); + return v.getValue().value; + } + } // parquet::BooleanType, parquet::Int64Type, parquet::FloatType, parquet::DoubleType - return value.get(); // FLOAT, DOUBLE, INT64 + return value.safeGet(); // FLOAT, DOUBLE, INT64, Int32 } }; @@ -51,34 +62,50 @@ struct ToParquet T as(const DB::Field & value, const parquet::ColumnDescriptor &) { assert(value.getType() == DB::Field::Types::String); - const std::string & s = value.get(); + const std::string & s = value.safeGet(); const auto * const ptr = reinterpret_cast(s.data()); return parquet::ByteArray(static_cast(s.size()), ptr); } }; +template +parquet::FixedLenByteArray convertField(const DB::Field & value, uint8_t * buf, size_t type_length) +{ + assert(sizeof(T) >= type_length); + + T val = value.safeGet>>().getValue().value; + std::reverse(reinterpret_cast(&val), reinterpret_cast(&val) + sizeof(T)); + const int offset = sizeof(T) - type_length; + + memcpy(buf, reinterpret_cast(&val) + offset, type_length); + return parquet::FixedLenByteArray(buf); +} + template <> struct ToParquet { - uint8_t buf[256]; + uint8_t buf[16]; using T = parquet::FixedLenByteArray; T as(const DB::Field & value, const parquet::ColumnDescriptor & descriptor) { - if (value.getType() != DB::Field::Types::Decimal128) - throw DB::Exception( - DB::ErrorCodes::LOGICAL_ERROR, "Field type '{}' for FIXED_LEN_BYTE_ARRAY is not supported", value.getTypeName()); - static_assert(sizeof(Int128) <= sizeof(buf)); - if (descriptor.type_length() > sizeof(Int128)) + if (value.getType() == DB::Field::Types::Decimal256) + throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Field type '{}' is not supported", value.getTypeName()); + + static_assert(sizeof(Int128) == sizeof(buf)); + + if (descriptor.type_length() > sizeof(buf)) throw DB::Exception( DB::ErrorCodes::LOGICAL_ERROR, - "descriptor.type_length() = {} , which is > {}, e.g. sizeof(Int128)", + "descriptor.type_length() = {} , which is > {}, e.g. 
sizeof(buf)", descriptor.type_length(), - sizeof(Int128)); - Int128 val = value.get>().getValue(); - std::reverse(reinterpret_cast(&val), reinterpret_cast(&val) + sizeof(val)); - const int offset = sizeof(Int128) - descriptor.type_length(); - memcpy(buf, reinterpret_cast(&val) + offset, descriptor.type_length()); - return parquet::FixedLenByteArray(buf); + sizeof(buf)); + + if (value.getType() == DB::Field::Types::Decimal32) + return convertField(value, buf, descriptor.type_length()); + if (value.getType() == DB::Field::Types::Decimal64) + return convertField(value, buf, descriptor.type_length()); + + return convertField(value, buf, descriptor.type_length()); } }; @@ -86,7 +113,7 @@ struct ToParquet template struct ConverterNumeric { - using From = typename Col::Container::value_type; + using From = typename Col::ValueType; using To = typename DType::c_type; const Col & column; @@ -119,6 +146,7 @@ using ConverterInt64 = ConverterNumeric>; using ConverterDouble = ConverterNumeric>; +using ConverterFloat = ConverterNumeric>; struct ConverterString { @@ -141,7 +169,7 @@ struct ConverterString /// Like ConverterNumberAsFixedString, but converts to big-endian. Because that's the byte order /// Parquet uses for decimal types and literally nothing else, for some reason. -template +template struct ConverterDecimal { const parquet::ColumnDescriptor & descriptor; @@ -165,7 +193,7 @@ struct ConverterDecimal data_buf.resize(count * sizeof(T)); ptr_buf.resize(count); memcpy(data_buf.data(), reinterpret_cast(column.getData().data() + offset), count * sizeof(T)); - const size_t offset_in_buf = sizeof(Int128) - descriptor.type_length(); + const size_t offset_in_buf = sizeof(T) - descriptor.type_length(); ; for (size_t i = 0; i < count; ++i) { @@ -176,6 +204,13 @@ struct ConverterDecimal } }; +using Decimal128ToFLB = ConverterDecimal; +using Decimal64ToFLB = ConverterDecimal; +using Decimal32ToFLB = ConverterDecimal; + +using ConverterDecimal32 = ConverterNumeric>; +using ConverterDecimal64 = ConverterNumeric>; + class BaseConverter { public: @@ -239,6 +274,8 @@ std::shared_ptr> ParquetConverter::Make(const DB: case TypeIndex::UInt32: result = std::make_shared>(ConverterInt32_u(c)); break; + case TypeIndex::Decimal32: + result = std::make_shared>(ConverterDecimal32(c)); default: break; } @@ -251,6 +288,8 @@ std::shared_ptr> ParquetConverter::Make(const DB: case TypeIndex::UInt64: result = std::make_shared>(ConverterInt64_u(c)); break; + case TypeIndex::Decimal64: + result = std::make_shared>(ConverterDecimal64(c)); default: break; } @@ -258,6 +297,14 @@ std::shared_ptr> ParquetConverter::Make(const DB: case parquet::Type::INT96: break; case parquet::Type::FLOAT: + switch (c->getDataType()) + { + case TypeIndex::Float32: + result = std::make_shared>(ConverterFloat(c)); + break; + default: + break; + } break; case parquet::Type::DOUBLE: switch (c->getDataType()) @@ -283,8 +330,13 @@ std::shared_ptr> ParquetConverter::Make(const DB: switch (c->getDataType()) { case TypeIndex::Decimal128: - result = std::make_shared>>( - ConverterDecimal(c, desc)); + result = std::make_shared>(Decimal128ToFLB(c, desc)); + break; + case TypeIndex::Decimal64: + result = std::make_shared>(Decimal64ToFLB(c, desc)); + break; + case TypeIndex::Decimal32: + result = std::make_shared>(Decimal32ToFLB(c, desc)); break; default: break; diff --git a/cpp-ch/local-engine/Storages/Parquet/VectorizedParquetRecordReader.h b/cpp-ch/local-engine/Storages/Parquet/VectorizedParquetRecordReader.h index a9c796a7556ba..b930df0cac956 100644 --- 
a/cpp-ch/local-engine/Storages/Parquet/VectorizedParquetRecordReader.h +++ b/cpp-ch/local-engine/Storages/Parquet/VectorizedParquetRecordReader.h @@ -235,7 +235,7 @@ class VectorizedParquetBlockInputFormat final : public DB::IInputFormat ColumnIndexFilterPtr column_index_filter_; protected: - void onCancel() override { is_stopped = 1; } + void onCancel() noexcept override { is_stopped = 1; } public: VectorizedParquetBlockInputFormat(DB::ReadBuffer & in_, const DB::Block & header_, const DB::FormatSettings & format_settings); diff --git a/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp b/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp index 1c5902c8ca67b..f123b7c74f41c 100644 --- a/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp +++ b/cpp-ch/local-engine/Storages/SourceFromJavaIter.cpp @@ -15,25 +15,20 @@ * limitations under the License. */ #include "SourceFromJavaIter.h" -#include -#include -#include -#include -#include -#include -#include +#include #include #include -#include #include -#include #include #include -#include -#include -#include -#include -#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} +} namespace local_engine { @@ -41,12 +36,38 @@ jclass SourceFromJavaIter::serialized_record_batch_iterator_class = nullptr; jmethodID SourceFromJavaIter::serialized_record_batch_iterator_hasNext = nullptr; jmethodID SourceFromJavaIter::serialized_record_batch_iterator_next = nullptr; - -static DB::Block getRealHeader(const DB::Block & header) +static DB::Block getRealHeader(const DB::Block & header, const DB::Block * first_block) { - if (header.columns()) + if (!header) + return BlockUtil::buildRowCountHeader(); + + if (!first_block) return header; - return BlockUtil::buildRowCountHeader(); + + if (header.columns() != first_block->columns()) + throw DB::Exception( + DB::ErrorCodes::LOGICAL_ERROR, + "Header first block have different number of columns, header:{} first_block:{}", + header.dumpStructure(), + first_block->dumpStructure()); + + DB::Block result; + const size_t column_size = header.columns(); + for (size_t i = 0; i < column_size; ++i) + { + const auto & header_column = header.getByPosition(i); + const auto & input_column = first_block->getByPosition(i); + chassert(header_column.name == input_column.name); + + DB::WhichDataType input_which(input_column.type); + /// Some AggregateFunctions may have parameters, so we need to use the exact type from the first block. + /// e.g. spark approx_percentile -> CH quantilesGK(accuracy, level1, level2, ...), the intermediate result type + /// parsed from substrait plan is always AggregateFunction(10000, 1)(quantilesGK, arg_type), which maybe different + /// from the actual intermediate result type from input block. So we need to use the exact type from the input block. + auto type = input_which.isAggregateFunction() ? 
input_column.type : header_column.type; + result.insert(DB::ColumnWithTypeAndName(type, header_column.name)); + } + return result; } @@ -62,8 +83,8 @@ DB::Block * SourceFromJavaIter::peekBlock(JNIEnv * env, jobject java_iter) SourceFromJavaIter::SourceFromJavaIter( - DB::ContextPtr context_, DB::Block header, jobject java_iter_, bool materialize_input_, DB::Block * first_block_) - : DB::ISource(getRealHeader(header)) + DB::ContextPtr context_, const DB::Block & header, jobject java_iter_, bool materialize_input_, const DB::Block * first_block_) + : DB::ISource(getRealHeader(header, first_block_)) , context(context_) , original_header(header) , java_iter(java_iter_) @@ -80,43 +101,50 @@ DB::Chunk SourceFromJavaIter::generate() GET_JNIENV(env) SCOPE_EXIT({CLEAN_JNIENV}); - DB::Chunk result; - DB::Block * data = nullptr; + DB::Block * input_block = nullptr; if (first_block) [[unlikely]] { - data = first_block; + input_block = const_cast(first_block); first_block = nullptr; } else if (jboolean has_next = safeCallBooleanMethod(env, java_iter, serialized_record_batch_iterator_hasNext)) { jbyteArray block = static_cast(safeCallObjectMethod(env, java_iter, serialized_record_batch_iterator_next)); - data = reinterpret_cast(byteArrayToLong(env, block)); + input_block = reinterpret_cast(byteArrayToLong(env, block)); } else return {}; - /// Post-processing - if (materialize_input) - materializeBlockInplace(*data); - - if (data->rows() > 0) + DB::Chunk result; + if (original_header) { - size_t rows = data->rows(); - if (original_header.columns()) + const auto & header = getPort().getHeader(); + chassert(header.columns() == input_block->columns()); + /// Cast all input columns in data to expected data types in header + for (size_t i = 0; i < header.columns(); ++i) { - result.setColumns(data->mutateColumns(), rows); - convertNullable(result); - auto info = std::make_shared(); - info->is_overflows = data->info.is_overflows; - info->bucket_num = data->info.bucket_num; - result.getChunkInfos().add(std::move(info)); - } - else - { - result = BlockUtil::buildRowCountChunk(rows); - auto info = std::make_shared(); - result.getChunkInfos().add(std::move(info)); + auto & input_column = input_block->getByPosition(i); + const auto & expected_type = header.getByPosition(i).type; + auto column = DB::castColumn(input_column, expected_type); + input_column.column = column; + input_column.type = expected_type; } + + /// Do materializing after casting is faster than materializing before casting + if (materialize_input) + materializeBlockInplace(*input_block); + + auto info = std::make_shared(); + info->is_overflows = input_block->info.is_overflows; + info->bucket_num = input_block->info.bucket_num; + result.getChunkInfos().add(std::move(info)); + result.setColumns(input_block->getColumns(), input_block->rows()); + } + else + { + result = BlockUtil::buildRowCountChunk(input_block->rows()); + auto info = std::make_shared(); + result.getChunkInfos().add(std::move(info)); } return result; } @@ -140,87 +168,4 @@ Int64 SourceFromJavaIter::byteArrayToLong(JNIEnv * env, jbyteArray arr) return result; } -void SourceFromJavaIter::convertNullable(DB::Chunk & chunk) -{ - auto output = this->getOutputs().front().getHeader(); - auto rows = chunk.getNumRows(); - auto columns = chunk.detachColumns(); - for (size_t i = 0; i < columns.size(); ++i) - { - const auto & column = columns.at(i); - const auto & type = output.getByPosition(i).type; - columns[i] = convertNestedNullable(column, type); - } - chunk.setColumns(columns, rows); -} - - 
-DB::ColumnPtr SourceFromJavaIter::convertNestedNullable(const DB::ColumnPtr & column, const DB::DataTypePtr & target_type) -{ - DB::WhichDataType column_type(column->getDataType()); - if (column_type.isAggregateFunction()) - return column; - - if (DB::isColumnConst(*column)) - { - const auto & data_column = assert_cast(*column).getDataColumnPtr(); - const auto & result_column = convertNestedNullable(data_column, target_type); - return DB::ColumnConst::create(result_column, column->size()); - } - - // if target type is non-nullable, the column type must be also non-nullable, recursively converting it's nested type - // if target type is nullable, the column type may be nullable or non-nullable, converting it and then recursively converting it's nested type - DB::ColumnPtr new_column = column; - if (!column_type.isNullable() && target_type->isNullable()) - new_column = DB::makeNullable(column); - - DB::ColumnPtr nested_column = new_column; - DB::DataTypePtr nested_target_type = removeNullable(target_type); - if (new_column->isNullable()) - { - const auto & nullable_col = typeid_cast(new_column->getPtr().get()); - nested_column = nullable_col->getNestedColumnPtr(); - const auto & result_column = convertNestedNullable(nested_column, nested_target_type); - return DB::ColumnNullable::create(result_column, nullable_col->getNullMapColumnPtr()); - } - - DB::WhichDataType nested_column_type(nested_column->getDataType()); - if (nested_column_type.isMap()) - { - // header: Map(String, Nullable(String)) - // chunk: Map(String, String) - const auto & array_column = assert_cast(*nested_column).getNestedColumn(); - const auto & map_type = assert_cast(*nested_target_type); - auto tuple_columns = assert_cast(array_column.getDataPtr().get())->getColumns(); - // only convert for value column as key is always non-nullable - const auto & value_column = convertNestedNullable(tuple_columns[1], map_type.getValueType()); - auto result_column = DB::ColumnArray::create(DB::ColumnTuple::create(DB::Columns{tuple_columns[0], value_column}), array_column.getOffsetsPtr()); - return DB::ColumnMap::create(std::move(result_column)); - } - - if (nested_column_type.isArray()) - { - // header: Array(Nullable(String)) - // chunk: Array(String) - const auto & list_column = assert_cast(*nested_column); - auto nested_type = assert_cast(*nested_target_type).getNestedType(); - const auto & result_column = convertNestedNullable(list_column.getDataPtr(), nested_type); - return DB::ColumnArray::create(result_column, list_column.getOffsetsPtr()); - } - - if (nested_column_type.isTuple()) - { - // header: Tuple(Nullable(String), Nullable(String)) - // chunk: Tuple(String, Nullable(String)) - const auto & tuple_column = assert_cast(*nested_column); - auto nested_types = assert_cast(*nested_target_type).getElements(); - DB::Columns columns; - for (size_t i = 0; i != tuple_column.tupleSize(); ++i) - columns.push_back(convertNestedNullable(tuple_column.getColumnPtr(i), nested_types[i])); - return DB::ColumnTuple::create(std::move(columns)); - } - - return new_column; -} - } diff --git a/cpp-ch/local-engine/Storages/SourceFromJavaIter.h b/cpp-ch/local-engine/Storages/SourceFromJavaIter.h index 6ee02e7480a0f..80ac42b7a2ddf 100644 --- a/cpp-ch/local-engine/Storages/SourceFromJavaIter.h +++ b/cpp-ch/local-engine/Storages/SourceFromJavaIter.h @@ -16,10 +16,9 @@ */ #pragma once #include -#include -#include #include - +#include +#include namespace local_engine { class SourceFromJavaIter : public DB::ISource @@ -30,18 +29,16 @@ class 
SourceFromJavaIter : public DB::ISource static jmethodID serialized_record_batch_iterator_next; static Int64 byteArrayToLong(JNIEnv * env, jbyteArray arr); - static DB::Block * peekBlock(JNIEnv * env, jobject java_iter); - SourceFromJavaIter(DB::ContextPtr context_, DB::Block header, jobject java_iter_, bool materialize_input_, DB::Block * peek_block_); + SourceFromJavaIter( + DB::ContextPtr context_, const DB::Block & header, jobject java_iter_, bool materialize_input_, const DB::Block * peek_block_); ~SourceFromJavaIter() override; String getName() const override { return "SourceFromJavaIter"; } private: DB::Chunk generate() override; - void convertNullable(DB::Chunk & chunk); - DB::ColumnPtr convertNestedNullable(const DB::ColumnPtr & column, const DB::DataTypePtr & target_type); DB::ContextPtr context; DB::Block original_header; @@ -49,7 +46,7 @@ class SourceFromJavaIter : public DB::ISource bool materialize_input; /// The first block read from java iteration to decide exact types of columns, especially for AggregateFunctions with parameters. - DB::Block * first_block = nullptr; + const DB::Block * first_block = nullptr; }; } diff --git a/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.cpp b/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.cpp index 3f7aac8724a09..eefd1c5fd1ec9 100644 --- a/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.cpp +++ b/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.cpp @@ -87,20 +87,20 @@ DataPartsVector StorageMergeTreeFactory::getDataPartsByNames(const StorageID & i std::unordered_set missing_names; if (!datapart_map->has(table_name)) [[unlikely]] { - auto cache = std::make_shared>(config.table_part_metadata_cache_max_count); + auto cache = std::make_shared>(config.table_part_metadata_cache_max_count); datapart_map->add(table_name, cache); } // find the missing cache part name for (const auto & name : part_name) { - if (!(*(datapart_map->get(table_name)))->has(name)) + if (!(*datapart_map->get(table_name))->has(name)) { missing_names.emplace(name); } else { - res.emplace_back((*((*(datapart_map->get(table_name)))->get(name)))); + res.emplace_back((*datapart_map->get(table_name))->get(name)->get()->dataPart()); } } @@ -112,17 +112,17 @@ DataPartsVector StorageMergeTreeFactory::getDataPartsByNames(const StorageID & i storage_merge_tree = storage_map->get(table_name)->first; } auto missing_parts = storage_merge_tree->loadDataPartsWithNames(missing_names); - for (const auto & part : missing_parts) + for (auto & part : missing_parts) { res.emplace_back(part); - (*(datapart_map->get(table_name)))->add(part->name, part); + (*datapart_map->get(table_name))->add(part->name, std::make_shared(part, storage_merge_tree)); } } return res; } // will be inited in native init phase std::unique_ptr>> StorageMergeTreeFactory::storage_map = nullptr; -std::unique_ptr>>> StorageMergeTreeFactory::datapart_map = nullptr; +std::unique_ptr>>> StorageMergeTreeFactory::datapart_map = nullptr; std::recursive_mutex StorageMergeTreeFactory::storage_map_mutex; std::recursive_mutex StorageMergeTreeFactory::datapart_mutex; diff --git a/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.h b/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.h index 3fa8c6285bbeb..09a2d5747b26f 100644 --- a/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.h +++ b/cpp-ch/local-engine/Storages/StorageMergeTreeFactory.h @@ -27,6 +27,37 @@ namespace local_engine { using CustomStorageMergeTreePtr = std::shared_ptr; +class DataPartStorageHolder +{ +public: + DataPartStorageHolder(const 
DataPartPtr& data_part, const CustomStorageMergeTreePtr& storage) + : data_part_(data_part), + storage_(storage) + { + } + + [[nodiscard]] DataPartPtr dataPart() const + { + return data_part_; + } + + [[nodiscard]] CustomStorageMergeTreePtr storage() const + { + return storage_; + } + + ~DataPartStorageHolder() + { + storage_->removePartFromMemory(*data_part_); + std::cerr << fmt::format("clean part {}", data_part_->name) << std::endl; + } + +private: + DataPartPtr data_part_; + CustomStorageMergeTreePtr storage_; +}; +using DataPartStorageHolderPtr = std::shared_ptr; + class StorageMergeTreeFactory { public: @@ -50,7 +81,7 @@ class StorageMergeTreeFactory auto & datapart_map_v = datapart_map; if (!datapart_map_v) { - datapart_map_v = std::make_unique>>>( + datapart_map_v = std::make_unique>>>( config.table_metadata_cache_max_count); } else @@ -68,7 +99,8 @@ class StorageMergeTreeFactory private: static std::unique_ptr>> storage_map; - static std::unique_ptr>>> datapart_map; + static std::unique_ptr>>> datapart_map; + static std::recursive_mutex storage_map_mutex; static std::recursive_mutex datapart_mutex; }; diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp index e449ede988ee8..4499a9a559a10 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp @@ -55,17 +55,13 @@ FormatFile::FormatFile( : context(context_), file_info(file_info_), read_buffer_builder(read_buffer_builder_) { PartitionValues part_vals = GlutenStringUtils::parsePartitionTablePath(file_info.uri_file()); - String partition_values_str = "["; for (size_t i = 0; i < part_vals.size(); ++i) { const auto & part = part_vals[i]; partition_keys.push_back(part.first); partition_values[part.first] = part.second; - if (i > 0) - partition_values_str += ", "; - partition_values_str += part.first + "=" + part.second; } - partition_values_str += "]"; + LOG_INFO( &Poco::Logger::get("FormatFile"), "Reading File path: {}, format: {}, range: {}, partition_index: {}, partition_values: {}", @@ -73,7 +69,7 @@ FormatFile::FormatFile( file_info.file_format_case(), std::to_string(file_info.start()) + "-" + std::to_string(file_info.start() + file_info.length()), file_info.partition_index(), - partition_values_str); + GlutenStringUtils::dumpPartitionValues(part_vals)); } FormatFilePtr FormatFileUtil::createFile( diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp index 1c57010751c0f..66556e237f77b 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/ORCFormatFile.cpp @@ -24,6 +24,7 @@ # include # include # include +# include namespace local_engine { @@ -67,7 +68,12 @@ FormatFile::InputFormatPtr ORCFormatFile::createInputFormat(const DB::Block & he std::back_inserter(skip_stripe_indices)); format_settings.orc.skip_stripes = std::unordered_set(skip_stripe_indices.begin(), skip_stripe_indices.end()); - + if (context->getConfigRef().has("timezone")) + { + const String config_timezone = context->getConfigRef().getString("timezone"); + const String mapped_timezone = DateTimeUtil::convertTimeZone(config_timezone); + format_settings.orc.reader_time_zone_name = mapped_timezone; + } auto input_format = std::make_shared(*file_format->read_buffer, header, format_settings); file_format->input = input_format; return file_format; diff --git 
a/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.cpp index da15890070b09..b32073db53c4a 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.cpp @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include @@ -49,7 +49,6 @@ #include #include #include -#include #include #include @@ -77,8 +76,6 @@ namespace ErrorCodes } } -namespace fs = std::filesystem; - namespace local_engine { template @@ -205,6 +202,7 @@ class LocalFileReadBufferBuilder : public ReadBufferBuilder #if USE_HDFS class HDFSFileReadBufferBuilder : public ReadBufferBuilder { + using ReadBufferCreator = std::function(bool restricted_seek, const DB::StoredObject & object)>; public: explicit HDFSFileReadBufferBuilder(DB::ContextPtr context_) : ReadBufferBuilder(context_), context(context_) { } ~HDFSFileReadBufferBuilder() override = default; @@ -212,18 +210,21 @@ class HDFSFileReadBufferBuilder : public ReadBufferBuilder std::unique_ptr build(const substrait::ReadRel::LocalFiles::FileOrFiles & file_info, bool set_read_util_position) override { - auto config = HdfsConfig::loadFromContext(context); + DB::ReadSettings read_settings = getReadSettings(context); + auto & config = context->getConfigRef(); + auto hdfs_config = HdfsConfig::loadFromContext(config, read_settings); Poco::URI file_uri(file_info.uri_file()); std::string uri_path = "hdfs://" + file_uri.getHost(); if (file_uri.getPort()) - uri_path += ":" + std::to_string(file_uri.getPort()); + uri_path += ":" + std::to_string(static_cast(file_uri.getPort())); - DB::ReadSettings read_settings; - std::unique_ptr read_buffer; + size_t read_util_position = 0; + size_t read_begin = 0; if (set_read_util_position) { std::pair start_end_pos - = adjustFileReadStartAndEndPos(file_info.start(), file_info.start() + file_info.length(), uri_path, file_uri.getPath()); + = adjustFileReadStartAndEndPos(file_info.start(), file_info.start() + file_info.length(), uri_path, file_uri.getPath()); + LOG_DEBUG( &Poco::Logger::get("ReadBufferBuilder"), "File read start and end position adjusted from {},{} to {},{}", @@ -232,34 +233,57 @@ class HDFSFileReadBufferBuilder : public ReadBufferBuilder start_end_pos.first, start_end_pos.second); - auto read_buffer_impl = std::make_unique( - uri_path, file_uri.getPath(), context->getConfigRef(), read_settings, start_end_pos.second, true); - if (config.hdfs_async) - { - auto & pool_reader = context->getThreadPoolReader(DB::FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - read_buffer = std::make_unique(pool_reader, read_settings, std::move(read_buffer_impl)); - } - else - read_buffer = std::move(read_buffer_impl); + read_begin = start_end_pos.first; + read_util_position = start_end_pos.second; + } - if (auto * seekable_in = dynamic_cast(read_buffer.get())) - if (start_end_pos.first) - seekable_in->seek(start_end_pos.first, SEEK_SET); + size_t file_size = 0; + if (file_info.has_properties()) + file_size = file_info.properties().filesize(); + + std::unique_ptr read_buffer; + + if (hdfs_config.hdfs_async) + { + std::optional size = std::nullopt; + if (file_size) + size = file_size; + + auto read_buffer_impl = std::make_shared( + uri_path, file_uri.getPath(), config, read_settings, read_util_position, true, size); + auto & pool_reader = context->getThreadPoolReader(DB::FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); + read_buffer = 
std::make_unique(pool_reader, read_settings, std::move(read_buffer_impl)); } else { - auto read_buffer_impl - = std::make_unique(uri_path, file_uri.getPath(), context->getConfigRef(), read_settings, 0, true); - if (config.hdfs_async) + if (!file_size) { - read_buffer = std::make_unique( - context->getThreadPoolReader(DB::FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER), - read_settings, - std::move(read_buffer_impl)); + // Only for Spark 3.2: the file partition does not contain the file size, + // so compute the file size first + auto read_buffer_impl = std::make_unique( + uri_path, file_uri.getPath(), config, read_settings, read_util_position, true); + file_size = read_buffer_impl->getFileSize(); } - else - read_buffer = std::move(read_buffer_impl); + + ReadBufferCreator hdfs_read_buffer_creator + = [this, hdfs_uri = uri_path, hdfs_file_path = file_uri.getPath(), read_settings, &config, read_util_position]( + bool /* restricted_seek */, const DB::StoredObject & object) -> std::unique_ptr + { + return std::make_unique( + hdfs_uri, hdfs_file_path, config, read_settings, read_util_position, true, object.bytes_size); + }; + + DB::StoredObjects stored_objects{DB::StoredObject{file_uri.getPath().substr(1), "", file_size}}; + auto cache_hdfs_read = std::make_unique( + std::move(hdfs_read_buffer_creator), stored_objects, "hdfs:", read_settings, nullptr, /* use_external_buffer */ false); + cache_hdfs_read->setReadUntilPosition(read_util_position); + read_buffer = std::move(cache_hdfs_read); } + + if (set_read_util_position && read_begin) + if (auto * seekable_in = dynamic_cast(read_buffer.get())) + seekable_in->seek(read_begin, SEEK_SET); + return read_buffer; } @@ -367,6 +391,7 @@ class HDFSFileReadBufferBuilder : public ReadBufferBuilder result.second = get_next_line_pos(fs.get(), fin, read_end_pos, hdfs_file_size); return result; } + private: DB::ContextPtr context; }; @@ -382,23 +407,19 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder explicit S3FileReadBufferBuilder(DB::ContextPtr context_) : ReadBufferBuilder(context_) { auto config = S3Config::loadFromContext(context); - new_settings = context->getReadSettings(); - new_settings.enable_filesystem_cache = config.s3_local_cache_enabled; - - if (new_settings.enable_filesystem_cache) + // the gluten cache config takes first priority + if (!file_cache && config.s3_local_cache_enabled) { DB::FileCacheSettings file_cache_settings; file_cache_settings.max_size = config.s3_local_cache_max_size; auto cache_base_path = config.s3_local_cache_cache_path; - if (!fs::exists(cache_base_path)) - fs::create_directories(cache_base_path); + if (!std::filesystem::exists(cache_base_path)) + std::filesystem::create_directories(cache_base_path); file_cache_settings.base_path = cache_base_path; file_cache = DB::FileCacheFactory::instance().getOrCreate("s3_local_cache", file_cache_settings, ""); file_cache->initialize(); - - new_settings.remote_fs_cache = file_cache; } } @@ -407,6 +428,7 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder std::unique_ptr build(const substrait::ReadRel::LocalFiles::FileOrFiles & file_info, bool set_read_util_position) override { + DB::ReadSettings read_settings = getReadSettings(context); Poco::URI file_uri(file_info.uri_file()); // file uri looks like: s3a://my-dev-bucket/tpch100/part/0001.parquet const std::string& bucket = file_uri.getHost(); @@ -416,9 +438,8 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder size_t object_size = object_info.size; Int64 object_modified_time = object_info.last_modification_time; - 
if (new_settings.enable_filesystem_cache) + if (read_settings.enable_filesystem_cache) { - auto file_cache_key = DB::FileCacheKey(key); auto last_cache_time = files_cache_time_map.get(file_cache_key); // quick check @@ -436,7 +457,7 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder } auto read_buffer_creator - = [bucket, client, this](bool restricted_seek, const DB::StoredObject & object) -> std::unique_ptr + = [bucket, client, read_settings, this](bool restricted_seek, const DB::StoredObject & object) -> std::unique_ptr { return std::make_unique( client, @@ -444,7 +465,7 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder object.remote_path, "", DB::S3::RequestSettings(), - new_settings, + read_settings, /* use_external_buffer */ true, /* offset */ 0, /* read_until_position */0, @@ -453,11 +474,11 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder DB::StoredObjects stored_objects{DB::StoredObject{key, "", object_size}}; auto s3_impl = std::make_unique( - std::move(read_buffer_creator), stored_objects, "s3:" + bucket + "/", new_settings, /* cache_log */ nullptr, /* use_external_buffer */ true); + std::move(read_buffer_creator), stored_objects, "s3:" + bucket + "/", read_settings, /* cache_log */ nullptr, /* use_external_buffer */ true); auto & pool_reader = context->getThreadPoolReader(DB::FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); auto async_reader - = std::make_unique(std::move(s3_impl), pool_reader, new_settings, nullptr, nullptr); + = std::make_unique(std::move(s3_impl), pool_reader, read_settings, nullptr, nullptr); if (set_read_util_position) { @@ -478,7 +499,7 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder async_reader->setReadUntilEnd(); } - if (new_settings.remote_fs_prefetch) + if (read_settings.remote_fs_prefetch) async_reader->prefetch(Priority{}); return async_reader; @@ -488,7 +509,6 @@ class S3FileReadBufferBuilder : public ReadBufferBuilder static const std::string SHARED_CLIENT_KEY; static ConcurrentLRU> per_bucket_clients; static FileCacheConcurrentMap files_cache_time_map; - DB::ReadSettings new_settings; DB::FileCachePtr file_cache; std::string & stripQuote(std::string & s) @@ -732,6 +752,57 @@ void registerReadBufferBuilders() #endif } +ReadBufferBuilder::ReadBufferBuilder(DB::ContextPtr context_) : context(context_) +{ + const auto & config = context->getConfigRef(); + if (config.getBool("gluten_cache.local.enabled", false)) + { + DB::FileCacheSettings file_cache_settings; + + file_cache_settings.loadFromConfig(config, "gluten_cache.local"); + + if (std::filesystem::path(file_cache_settings.base_path).is_relative()) + file_cache_settings.base_path = std::filesystem::path(context->getPath()) / "caches" / file_cache_settings.base_path; + + if (!std::filesystem::exists(file_cache_settings.base_path)) + std::filesystem::create_directories(file_cache_settings.base_path); + + auto name = config.getString("gluten_cache.local.name"); + auto * config_prefix = ""; + file_cache = DB::FileCacheFactory::instance().getOrCreate(name, file_cache_settings, config_prefix); + file_cache->initialize(); + } +} + +DB::ReadSettings ReadBufferBuilder::getReadSettings(DB::ContextPtr context) const +{ + DB::ReadSettings read_settings = context->getReadSettings(); + if (file_cache) + { + read_settings.enable_filesystem_cache = true; + read_settings.remote_fs_cache = file_cache; + } + else + { + read_settings.enable_filesystem_cache = false; + } + + return read_settings; +} + + +std::unique_ptr 
+ReadBufferBuilder::buildWithCompressionWrapper(const substrait::ReadRel::LocalFiles::FileOrFiles & file_info, bool set_read_util_position) +{ + auto in = build(file_info, set_read_util_position); + + /// Wrap the read buffer with compression method if exists + Poco::URI file_uri(file_info.uri_file()); + DB::CompressionMethod compression = DB::chooseCompressionMethod(file_uri.getPath(), "auto"); + return compression != DB::CompressionMethod::None ? DB::wrapReadBufferWithCompressionMethod(std::move(in), compression) : std::move(in); +} + + ReadBufferBuilderFactory & ReadBufferBuilderFactory::instance() { static ReadBufferBuilderFactory instance; diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.h b/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.h index f5218f0aa5def..92d8d41c1290f 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.h +++ b/cpp-ch/local-engine/Storages/SubstraitSource/ReadBufferBuilder.h @@ -15,21 +15,21 @@ * limitations under the License. */ #pragma once + #include #include #include -#include -#include -#include #include -#include + namespace local_engine { + class ReadBufferBuilder { public: - explicit ReadBufferBuilder(DB::ContextPtr context_) : context(context_) { } + explicit ReadBufferBuilder(DB::ContextPtr context_); + virtual ~ReadBufferBuilder() = default; /// build a new read buffer @@ -37,19 +37,14 @@ class ReadBufferBuilder build(const substrait::ReadRel::LocalFiles::FileOrFiles & file_info, bool set_read_util_position = false) = 0; /// build a new read buffer, consider compression method - std::unique_ptr buildWithCompressionWrapper(const substrait::ReadRel::LocalFiles::FileOrFiles & file_info, bool set_read_util_position = false) - { - auto in = build(file_info, set_read_util_position); - - /// Wrap the read buffer with compression method if exists - Poco::URI file_uri(file_info.uri_file()); - DB::CompressionMethod compression = DB::chooseCompressionMethod(file_uri.getPath(), "auto"); - return compression != DB::CompressionMethod::None ? 
DB::wrapReadBufferWithCompressionMethod(std::move(in), compression) - : std::move(in); - } + std::unique_ptr buildWithCompressionWrapper(const substrait::ReadRel::LocalFiles::FileOrFiles & file_info, bool set_read_util_position = false); protected: + DB::ReadSettings getReadSettings(DB::ContextPtr context) const; DB::ContextPtr context; + +public: + DB::FileCachePtr file_cache = nullptr; }; using ReadBufferBuilderPtr = std::shared_ptr; diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.cpp index d4e9f1eb8d4b3..d8f0ee0e35527 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.cpp @@ -144,7 +144,7 @@ bool SubstraitFileSource::tryPrepareReader() } -void SubstraitFileSource::onCancel() +void SubstraitFileSource::onCancel() noexcept { if (file_reader) file_reader->cancel(); diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.h b/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.h index 571e4097107ac..113538a929224 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.h +++ b/cpp-ch/local-engine/Storages/SubstraitSource/SubstraitFileSource.h @@ -131,7 +131,7 @@ class SubstraitFileSource : public DB::SourceWithKeyCondition private: bool tryPrepareReader(); - void onCancel() override; + void onCancel() noexcept override; DB::ContextPtr context; DB::Block output_header; /// Sample header may contains partitions keys diff --git a/cpp-ch/local-engine/local_engine_jni.cpp b/cpp-ch/local-engine/local_engine_jni.cpp index c4e8ec67b106a..c80379a879f87 100644 --- a/cpp-ch/local-engine/local_engine_jni.cpp +++ b/cpp-ch/local-engine/local_engine_jni.cpp @@ -163,6 +163,7 @@ JNIEXPORT jint JNI_OnLoad(JavaVM * vm, void * /*reserved*/) env, local_engine::SparkRowToCHColumn::spark_row_interator_class, "nextBatch", "()Ljava/nio/ByteBuffer;"); local_engine::BroadCastJoinBuilder::init(env); + local_engine::CacheManager::initJNI(env); local_engine::JNIUtils::vm = vm; return JNI_VERSION_1_8; @@ -544,6 +545,7 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_na jlong map_id, jint split_size, jstring codec, + jint compress_level, jstring data_file, jstring local_dirs, jint num_sub_dirs, @@ -585,6 +587,7 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_na .hash_exprs = hash_exprs, .out_exprs = out_exprs, .compress_method = jstring2string(env, codec), + .compress_level = compress_level < 0 ? std::nullopt : std::optional(compress_level), .spill_threshold = static_cast(spill_threshold), .hash_algorithm = jstring2string(env, hash_algorithm), .max_sort_buffer_size = static_cast(max_sort_buffer_size), @@ -606,6 +609,7 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_na jlong map_id, jint split_size, jstring codec, + jint compress_level, jlong spill_threshold, jstring hash_algorithm, jobject pusher, @@ -637,6 +641,7 @@ JNIEXPORT jlong Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_na .hash_exprs = hash_exprs, .out_exprs = out_exprs, .compress_method = jstring2string(env, codec), + .compress_level = compress_level < 0 ? 
std::nullopt : std::optional(compress_level), .spill_threshold = static_cast(spill_threshold), .hash_algorithm = jstring2string(env, hash_algorithm), .force_memory_sort = static_cast(force_memory_sort)}; @@ -673,6 +678,11 @@ JNIEXPORT jobject Java_org_apache_gluten_vectorized_CHShuffleSplitterJniWrapper_ const auto * raw_src = reinterpret_cast(raw_partition_lengths.data()); env->SetLongArrayRegion(raw_partition_length_arr, 0, raw_partition_lengths.size(), raw_src); + // AQE depends on total_bytes_written; if the value is wrong, it will generate an inappropriate plan, + // so add a warning log here as a reminder. + if (!result.total_bytes_written) + LOG_WARNING(getLogger("CHShuffleSplitterJniWrapper"), "total_bytes_written is 0, something may be wrong"); + jobject split_result = env->NewObject( split_result_class, split_result_constructor, @@ -995,6 +1005,8 @@ JNIEXPORT jstring Java_org_apache_spark_sql_execution_datasources_CHDatasourceJn // each task using its own CustomStorageMergeTree, don't reuse auto temp_storage = local_engine::MergeTreeRelParser::copyToVirtualStorage(merge_tree_table, context); + // prefetch all needed parts metadata before merge + local_engine::restoreMetaData(temp_storage, merge_tree_table, *context); local_engine::TempStorageFreer freer{temp_storage->getStorageID()}; // to release temp CustomStorageMergeTree with RAII std::vector selected_parts = local_engine::StorageMergeTreeFactory::instance().getDataPartsByNames( @@ -1158,11 +1170,11 @@ JNIEXPORT jint Java_org_apache_gluten_vectorized_BlockSplitIterator_nativeNextPa } JNIEXPORT jlong Java_org_apache_gluten_vectorized_BlockOutputStream_nativeCreate( - JNIEnv * env, jobject, jobject output_stream, jbyteArray buffer, jstring codec, jboolean compressed, jint customize_buffer_size) + JNIEnv * env, jobject, jobject output_stream, jbyteArray buffer, jstring codec, jint level, jboolean compressed, jint customize_buffer_size) { LOCAL_ENGINE_JNI_METHOD_START local_engine::ShuffleWriter * writer - = new local_engine::ShuffleWriter(output_stream, buffer, jstring2string(env, codec), compressed, customize_buffer_size); + = new local_engine::ShuffleWriter(output_stream, buffer, jstring2string(env, codec), level, compressed, customize_buffer_size); return reinterpret_cast(writer); LOCAL_ENGINE_JNI_METHOD_END(env, -1) } @@ -1263,7 +1275,7 @@ JNIEXPORT void Java_org_apache_gluten_utils_TestExceptionUtils_generateNativeExc -JNIEXPORT void Java_org_apache_gluten_execution_CHNativeCacheManager_nativeCacheParts(JNIEnv * env, jobject, jstring table_, jstring columns_, jboolean async_) +JNIEXPORT jstring Java_org_apache_gluten_execution_CHNativeCacheManager_nativeCacheParts(JNIEnv * env, jobject, jstring table_, jstring columns_) { LOCAL_ENGINE_JNI_METHOD_START auto table_def = jstring2string(env, table_); @@ -1274,7 +1286,38 @@ JNIEXPORT void Java_org_apache_gluten_execution_CHNativeCacheManager_nativeCache { column_set.insert(col); } - local_engine::CacheManager::instance().cacheParts(table_def, column_set, async_); + auto id = local_engine::CacheManager::instance().cacheParts(table_def, column_set); + return local_engine::charTojstring(env, id.c_str()); + LOCAL_ENGINE_JNI_METHOD_END(env, nullptr); +} + +JNIEXPORT jobject Java_org_apache_gluten_execution_CHNativeCacheManager_nativeGetCacheStatus(JNIEnv * env, jobject, jstring id) +{ + LOCAL_ENGINE_JNI_METHOD_START + return local_engine::CacheManager::instance().getCacheStatus(env, jstring2string(env, id)); + LOCAL_ENGINE_JNI_METHOD_END(env, nullptr); +} + +JNIEXPORT jstring
Java_org_apache_gluten_execution_CHNativeCacheManager_nativeCacheFiles(JNIEnv * env, jobject, jbyteArray files) +{ + LOCAL_ENGINE_JNI_METHOD_START + const auto files_bytes = local_engine::getByteArrayElementsSafe(env, files); + const std::string::size_type files_bytes_size = files_bytes.length(); + std::string_view files_view = {reinterpret_cast(files_bytes.elems()), files_bytes_size}; + substrait::ReadRel::LocalFiles local_files = local_engine::BinaryToMessage(files_view); + + auto jobId = local_engine::CacheManager::instance().cacheFiles(local_files); + return local_engine::charTojstring(env, jobId.c_str()); + LOCAL_ENGINE_JNI_METHOD_END(env, nullptr); +} + +JNIEXPORT void Java_org_apache_gluten_execution_CHNativeCacheManager_removeFiles(JNIEnv * env, jobject, jstring file_, jstring cache_name_) +{ + LOCAL_ENGINE_JNI_METHOD_START + auto file = jstring2string(env, file_); + auto cache_name = jstring2string(env, cache_name_); + + local_engine::CacheManager::removeFiles(file, cache_name); LOCAL_ENGINE_JNI_METHOD_END(env, ); } diff --git a/cpp-ch/local-engine/tests/data/68131.parquet b/cpp-ch/local-engine/tests/data/68131.parquet new file mode 100644 index 0000000000000..169f6152003db Binary files /dev/null and b/cpp-ch/local-engine/tests/data/68131.parquet differ diff --git a/cpp-ch/local-engine/tests/data/68135.snappy.parquet b/cpp-ch/local-engine/tests/data/68135.snappy.parquet new file mode 100644 index 0000000000000..ddd627790cd1b Binary files /dev/null and b/cpp-ch/local-engine/tests/data/68135.snappy.parquet differ diff --git a/cpp-ch/local-engine/tests/decmial_filter_push_down/18_2.json b/cpp-ch/local-engine/tests/decmial_filter_push_down/18_2.json new file mode 100644 index 0000000000000..5ad0a62325def --- /dev/null +++ b/cpp-ch/local-engine/tests/decmial_filter_push_down/18_2.json @@ -0,0 +1,160 @@ +{ + "relations": [ + { + "root": { + "input": { + "filter": { + "common": { + "direct": {} + }, + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "a" + ], + "struct": { + "types": [ + { + "decimal": { + "scale": 2, + "precision": 18, + "nullability": "NULLABILITY_NULLABLE" + } + } + ] + }, + "columnTypes": [ + "NORMAL_COL" + ] + }, + "filter": { + "singularOrList": { + "value": { + "selection": { + "directReference": { + "structField": {} + } + } + }, + "options": [ + { + "literal": { + "decimal": { + "value": "yAAAAAAAAAAAAAAAAAAAAA==", + "precision": 18, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "LAEAAAAAAAAAAAAAAAAAAA==", + "precision": 18, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "kAEAAAAAAAAAAAAAAAAAAA==", + "precision": 18, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "9AEAAAAAAAAAAAAAAAAAAA==", + "precision": 18, + "scale": 2 + } + } + } + ] + } + }, + "advancedExtension": { + "optimization": { + "@type": "type.googleapis.com/google.protobuf.StringValue", + "value": "isMergeTree=0\n" + } + } + } + }, + "condition": { + "singularOrList": { + "value": { + "selection": { + "directReference": { + "structField": {} + } + } + }, + "options": [ + { + "literal": { + "decimal": { + "value": "yAAAAAAAAAAAAAAAAAAAAA==", + "precision": 18, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "LAEAAAAAAAAAAAAAAAAAAA==", + "precision": 18, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "kAEAAAAAAAAAAAAAAAAAAA==", + "precision": 18, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": 
"9AEAAAAAAAAAAAAAAAAAAA==", + "precision": 18, + "scale": 2 + } + } + } + ] + } + } + } + }, + "names": [ + "a#4772" + ], + "outputSchema": { + "types": [ + { + "decimal": { + "scale": 2, + "precision": 18, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + } + } + ] +} \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/decmial_filter_push_down/18_2_flba.snappy.parquet b/cpp-ch/local-engine/tests/decmial_filter_push_down/18_2_flba.snappy.parquet new file mode 100644 index 0000000000000..ac0b015900dfc Binary files /dev/null and b/cpp-ch/local-engine/tests/decmial_filter_push_down/18_2_flba.snappy.parquet differ diff --git a/cpp-ch/local-engine/tests/gtest_clickhouse_pr_verify.cpp b/cpp-ch/local-engine/tests/gtest_clickhouse_pr_verify.cpp index 6352a819927c6..5b5797ed7d21d 100644 --- a/cpp-ch/local-engine/tests/gtest_clickhouse_pr_verify.cpp +++ b/cpp-ch/local-engine/tests/gtest_clickhouse_pr_verify.cpp @@ -34,8 +34,8 @@ TEST(Clickhouse, PR54881) { const auto context1 = DB::Context::createCopy(SerializedPlanParser::global_context); // context1->setSetting("enable_named_columns_in_function_tuple", DB::Field(true)); - auto settingxs = context1->getSettingsRef(); - EXPECT_FALSE(settingxs.enable_named_columns_in_function_tuple) << "GLUTEN NEED set enable_named_columns_in_function_tuple to false"; + auto settings = context1->getSettingsRef(); + EXPECT_FALSE(settings.enable_named_columns_in_function_tuple) << "GLUTEN NEED set enable_named_columns_in_function_tuple to false"; const std::string split_template = R"({"items":[{"uriFile":"{replace_local_files}","partitionIndex":"0","length":"1529","parquet":{},"schema":{},"metadataColumns":[{}]}]})"; @@ -63,7 +63,7 @@ TEST(Clickhouse, PR54881) Field field; const auto & col_1 = *(block.getColumns()[1]); col_1.get(0, field); - const Tuple & row_0 = field.get(); + const Tuple & row_0 = field.safeGet(); EXPECT_EQ(2, row_0.size()); Int64 actual{-1}; @@ -74,7 +74,7 @@ TEST(Clickhouse, PR54881) EXPECT_EQ(10, actual); col_1.get(1, field); - const Tuple & row_1 = field.get(); + const Tuple & row_1 = field.safeGet(); EXPECT_EQ(2, row_1.size()); EXPECT_TRUE(row_1[0].tryGet(actual)); EXPECT_EQ(10, actual); @@ -96,4 +96,44 @@ TEST(Clickhouse, PR65234) const auto plan = local_engine::JsonStringToMessage( {reinterpret_cast(gresource_embedded_pr_65234_jsonData), gresource_embedded_pr_65234_jsonSize}); auto query_plan = parser.parse(plan); +} + +INCBIN(resource_embedded_pr_68135_json, SOURCE_DIR "/utils/extern-local-engine/tests/json/clickhouse_pr_68135.json"); +TEST(Clickhouse, PR68135) +{ + const std::string split_template + = R"({"items":[{"uriFile":"{replace_local_files}","partitionIndex":"0","length":"461","parquet":{},"schema":{},"metadataColumns":[{}]}]})"; + const std::string split + = replaceLocalFilesWildcards(split_template, GLUTEN_DATA_DIR("/utils/extern-local-engine/tests/data/68135.snappy.parquet")); + + SerializedPlanParser parser(SerializedPlanParser::global_context); + parser.addSplitInfo(local_engine::JsonStringToBinary(split)); + + const auto plan = local_engine::JsonStringToMessage( + {reinterpret_cast(gresource_embedded_pr_68135_jsonData), gresource_embedded_pr_68135_jsonSize}); + + auto local_executor = parser.createExecutor(plan); + EXPECT_TRUE(local_executor->hasNext()); + const Block & x = *local_executor->nextColumnar(); + debug::headBlock(x); +} + +INCBIN(resource_embedded_pr_68131_json, SOURCE_DIR "/utils/extern-local-engine/tests/json/clickhouse_pr_68131.json"); 
+TEST(Clickhouse, PR68131) +{ + const std::string split_template + = R"({"items":[{"uriFile":"{replace_local_files}","partitionIndex":"0","length":"289","parquet":{},"schema":{},"metadataColumns":[{}]}]})"; + const std::string split + = replaceLocalFilesWildcards(split_template, GLUTEN_DATA_DIR("/utils/extern-local-engine/tests/data/68131.parquet")); + + SerializedPlanParser parser(SerializedPlanParser::global_context); + parser.addSplitInfo(local_engine::JsonStringToBinary(split)); + + const auto plan = local_engine::JsonStringToMessage( + {reinterpret_cast(gresource_embedded_pr_68131_jsonData), gresource_embedded_pr_68131_jsonSize}); + + auto local_executor = parser.createExecutor(plan); + EXPECT_TRUE(local_executor->hasNext()); + const Block & x = *local_executor->nextColumnar(); + debug::headBlock(x); } \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/gtest_parquet_columnindex_bug.cpp b/cpp-ch/local-engine/tests/gtest_parquet_columnindex_bug.cpp new file mode 100644 index 0000000000000..ee6e70305b27b --- /dev/null +++ b/cpp-ch/local-engine/tests/gtest_parquet_columnindex_bug.cpp @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace local_engine; + +using namespace DB; + +INCBIN(resource_embedded_pr_18_2_json, SOURCE_DIR "/utils/extern-local-engine/tests/decmial_filter_push_down/18_2.json"); +TEST(ColumnIndex, Deciaml182) +{ + // [precision,scale] = [18,2] + const auto context1 = DB::Context::createCopy(SerializedPlanParser::global_context); + + auto config = ExecutorConfig::loadFromContext(context1); + EXPECT_TRUE(config.use_local_format) << "gtest need set use_local_format to true"; + + const std::string split_template + = R"({"items":[{"uriFile":"{replace_local_files}","partitionIndex":"0","length":"488","parquet":{},"schema":{},"metadataColumns":[{}]}]})"; + const std::string split = replaceLocalFilesWildcards( + split_template, GLUTEN_DATA_DIR("/utils/extern-local-engine/tests/decmial_filter_push_down/18_2_flba.snappy.parquet")); + + SerializedPlanParser parser(context1); + parser.addSplitInfo(local_engine::JsonStringToBinary(split)); + + const auto plan = local_engine::JsonStringToMessage( + {reinterpret_cast(gresource_embedded_pr_18_2_jsonData), gresource_embedded_pr_18_2_jsonSize}); + + auto local_executor = parser.createExecutor(plan); + EXPECT_TRUE(local_executor->hasNext()); + const Block & x = *local_executor->nextColumnar(); + debug::headBlock(x); +} \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/gtest_spark_row.cpp b/cpp-ch/local-engine/tests/gtest_spark_row.cpp index 0350bb16c705d..963f7736858f5 100644 --- a/cpp-ch/local-engine/tests/gtest_spark_row.cpp +++ b/cpp-ch/local-engine/tests/gtest_spark_row.cpp @@ -136,7 +136,7 @@ TEST(SparkRow, GetArrayElementSize) {std::make_shared(), 4}, {std::make_shared(), 4}, {std::make_shared(), 4}, - {std::make_shared(9, 4), 4}, + {std::make_shared(9, 4), 8}, {std::make_shared(), 8}, {std::make_shared(), 8}, {std::make_shared(), 8}, @@ -152,11 +152,11 @@ TEST(SparkRow, GetArrayElementSize) for (const auto & [type, size] : type_to_size) { - EXPECT_TRUE(BackingDataLengthCalculator::getArrayElementSize(type) == size); + EXPECT_EQ(size, BackingDataLengthCalculator::getArrayElementSize(type)); if (type->canBeInsideNullable()) { const auto type_with_nullable = std::make_shared(type); - EXPECT_TRUE(BackingDataLengthCalculator::getArrayElementSize(type_with_nullable) == size); + EXPECT_EQ(size, BackingDataLengthCalculator::getArrayElementSize(type_with_nullable)); } } } diff --git a/cpp-ch/local-engine/tests/json/clickhouse_pr_68131.json b/cpp-ch/local-engine/tests/json/clickhouse_pr_68131.json new file mode 100644 index 0000000000000..0add2092b8174 --- /dev/null +++ b/cpp-ch/local-engine/tests/json/clickhouse_pr_68131.json @@ -0,0 +1,95 @@ +{ + "extensions": [ + { + "extensionFunction": { + "name": "is_not_null:list" + } + } + ], + "relations": [ + { + "root": { + "input": { + "filter": { + "common": { + "direct": {} + }, + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "f" + ], + "struct": { + "types": [ + { + "list": { + "type": { + "i32": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "nullability": "NULLABILITY_NULLABLE" + } + } + ] + }, + "columnTypes": [ + "NORMAL_COL" + ] + }, + "advancedExtension": { + "optimization": { + "@type": "type.googleapis.com/google.protobuf.StringValue", + "value": "isMergeTree=0\n" + } + } + } + }, + "condition": { + "scalarFunction": { + "outputType": { + "bool": { + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + 
"directReference": { + "structField": {} + } + } + } + } + ] + } + } + } + }, + "names": [ + "f#0" + ], + "outputSchema": { + "types": [ + { + "list": { + "type": { + "i32": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + "nullability": "NULLABILITY_REQUIRED" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + } + } + ] +} \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/json/clickhouse_pr_68135.json b/cpp-ch/local-engine/tests/json/clickhouse_pr_68135.json new file mode 100644 index 0000000000000..c8b49857c79a7 --- /dev/null +++ b/cpp-ch/local-engine/tests/json/clickhouse_pr_68135.json @@ -0,0 +1,160 @@ +{ + "relations": [ + { + "root": { + "input": { + "filter": { + "common": { + "direct": {} + }, + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "a" + ], + "struct": { + "types": [ + { + "decimal": { + "scale": 2, + "precision": 9, + "nullability": "NULLABILITY_NULLABLE" + } + } + ] + }, + "columnTypes": [ + "NORMAL_COL" + ] + }, + "filter": { + "singularOrList": { + "value": { + "selection": { + "directReference": { + "structField": {} + } + } + }, + "options": [ + { + "literal": { + "decimal": { + "value": "yAAAAAAAAAAAAAAAAAAAAA==", + "precision": 9, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "LAEAAAAAAAAAAAAAAAAAAA==", + "precision": 9, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "kAEAAAAAAAAAAAAAAAAAAA==", + "precision": 9, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "9AEAAAAAAAAAAAAAAAAAAA==", + "precision": 9, + "scale": 2 + } + } + } + ] + } + }, + "advancedExtension": { + "optimization": { + "@type": "type.googleapis.com/google.protobuf.StringValue", + "value": "isMergeTree=0\n" + } + } + } + }, + "condition": { + "singularOrList": { + "value": { + "selection": { + "directReference": { + "structField": {} + } + } + }, + "options": [ + { + "literal": { + "decimal": { + "value": "yAAAAAAAAAAAAAAAAAAAAA==", + "precision": 9, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "LAEAAAAAAAAAAAAAAAAAAA==", + "precision": 9, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "kAEAAAAAAAAAAAAAAAAAAA==", + "precision": 9, + "scale": 2 + } + } + }, + { + "literal": { + "decimal": { + "value": "9AEAAAAAAAAAAAAAAAAAAA==", + "precision": 9, + "scale": 2 + } + } + } + ] + } + } + } + }, + "names": [ + "a#26" + ], + "outputSchema": { + "types": [ + { + "decimal": { + "scale": 2, + "precision": 9, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "nullability": "NULLABILITY_REQUIRED" + } + } + } + ] +} \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/json/gtest_local_engine_config.json b/cpp-ch/local-engine/tests/json/gtest_local_engine_config.json index 10f0ea3dfdad9..8ada07819bb6c 100644 --- a/cpp-ch/local-engine/tests/json/gtest_local_engine_config.json +++ b/cpp-ch/local-engine/tests/json/gtest_local_engine_config.json @@ -260,6 +260,14 @@ "value": { "string": "false" } + }, + { + "key": { + "string": "spark.gluten.sql.columnar.backend.ch.runtime_config.use_local_format" + }, + "value": { + "string": "true" + } } ] } diff --git a/cpp/core/config/GlutenConfig.cc b/cpp/core/config/GlutenConfig.cc index fa04ecfa4e5c5..bc6ad1cbe859d 100644 --- a/cpp/core/config/GlutenConfig.cc +++ b/cpp/core/config/GlutenConfig.cc @@ -15,13 +15,26 @@ * limitations under the License. 
*/ +#include #include - +#include #include "compute/ProtobufUtils.h" #include "config.pb.h" #include "jni/JniError.h" +namespace { + +std::optional getRedactionRegex(const std::unordered_map& conf) { + auto it = conf.find(gluten::kSparkRedactionRegex); + if (it != conf.end()) { + return boost::regex(it->second); + } + return std::nullopt; +} +} // namespace + namespace gluten { + std::unordered_map parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) { std::unordered_map sparkConfs; @@ -37,9 +50,17 @@ parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength) std::string printConfig(const std::unordered_map& conf) { std::ostringstream oss; oss << std::endl; - for (auto& [k, v] : conf) { - oss << " [" << k << ", " << v << "]\n"; + + auto redactionRegex = getRedactionRegex(conf); + + for (const auto& [k, v] : conf) { + if (redactionRegex && boost::regex_match(k, *redactionRegex)) { + oss << " [" << k << ", " << kSparkRedactionString << "]\n"; + } else { + oss << " [" << k << ", " << v << "]\n"; + } } return oss.str(); } + } // namespace gluten diff --git a/cpp/core/config/GlutenConfig.h b/cpp/core/config/GlutenConfig.h index e4f5a884b9200..31318ff0aa0c1 100644 --- a/cpp/core/config/GlutenConfig.h +++ b/cpp/core/config/GlutenConfig.h @@ -57,7 +57,6 @@ const std::string kGzipWindowSize4k = "4096"; const std::string kParquetCompressionCodec = "spark.sql.parquet.compression.codec"; const std::string kColumnarToRowMemoryThreshold = "spark.gluten.sql.columnarToRowMemoryThreshold"; -const std::string kColumnarToRowMemoryDefaultThreshold = "67108864"; // 64MB const std::string kUGIUserName = "spark.gluten.ugi.username"; const std::string kUGITokens = "spark.gluten.ugi.tokens"; @@ -67,6 +66,9 @@ const std::string kShuffleCompressionCodecBackend = "spark.gluten.sql.columnar.s const std::string kQatBackendName = "qat"; const std::string kIaaBackendName = "iaa"; +const std::string kSparkRedactionRegex = "spark.redaction.regex"; +const std::string kSparkRedactionString = "*********(redacted)"; + std::unordered_map parseConfMap(JNIEnv* env, const uint8_t* planData, const int32_t planDataLength); diff --git a/cpp/core/jni/JniCommon.cc b/cpp/core/jni/JniCommon.cc index 759a9d121f911..0d2b5d874dd2e 100644 --- a/cpp/core/jni/JniCommon.cc +++ b/cpp/core/jni/JniCommon.cc @@ -38,7 +38,7 @@ jmethodID gluten::JniCommonState::runtimeAwareCtxHandle() { } void gluten::JniCommonState::initialize(JNIEnv* env) { - runtimeAwareClass_ = createGlobalClassReference(env, "Lorg/apache/gluten/exec/RuntimeAware;"); + runtimeAwareClass_ = createGlobalClassReference(env, "Lorg/apache/gluten/runtime/RuntimeAware;"); runtimeAwareCtxHandle_ = getMethodIdOrError(env, runtimeAwareClass_, "handle", "()J"); JavaVM* vm; if (env->GetJavaVM(&vm) != JNI_OK) { diff --git a/cpp/core/jni/JniWrapper.cc b/cpp/core/jni/JniWrapper.cc index 60f367fd72d1a..4be5e9142818f 100644 --- a/cpp/core/jni/JniWrapper.cc +++ b/cpp/core/jni/JniWrapper.cc @@ -165,7 +165,7 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { jniByteInputStreamClose = getMethodIdOrError(env, jniByteInputStreamClass, "close", "()V"); splitResultClass = createGlobalClassReferenceOrError(env, "Lorg/apache/gluten/vectorized/GlutenSplitResult;"); - splitResultConstructor = getMethodIdOrError(env, splitResultClass, "", "(JJJJJJJJJ[J[J)V"); + splitResultConstructor = getMethodIdOrError(env, splitResultClass, "", "(JJJJJJJJJJ[J[J)V"); columnarBatchSerializeResultClass = createGlobalClassReferenceOrError(env, 
"Lorg/apache/gluten/vectorized/ColumnarBatchSerializeResult;"); @@ -222,7 +222,7 @@ namespace { const std::string kBacktraceAllocation = "spark.gluten.memory.backtrace.allocation"; } -JNIEXPORT jlong JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_createRuntime( // NOLINT +JNIEXPORT jlong JNICALL Java_org_apache_gluten_runtime_RuntimeJniWrapper_createRuntime( // NOLINT JNIEnv* env, jclass, jstring jbackendType, @@ -249,7 +249,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_createRunt JNI_METHOD_END(kInvalidObjectHandle) } -JNIEXPORT jbyteArray JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_collectMemoryUsage( // NOLINT +JNIEXPORT jbyteArray JNICALL Java_org_apache_gluten_runtime_RuntimeJniWrapper_collectMemoryUsage( // NOLINT JNIEnv* env, jclass, jlong ctxHandle) { @@ -268,7 +268,7 @@ JNIEXPORT jbyteArray JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_colle JNI_METHOD_END(nullptr) } -JNIEXPORT jlong JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_shrinkMemory( // NOLINT +JNIEXPORT jlong JNICALL Java_org_apache_gluten_runtime_RuntimeJniWrapper_shrinkMemory( // NOLINT JNIEnv* env, jclass, jlong ctxHandle, @@ -279,7 +279,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_shrinkMemo JNI_METHOD_END(kInvalidObjectHandle) } -JNIEXPORT void JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_holdMemory( // NOLINT +JNIEXPORT void JNICALL Java_org_apache_gluten_runtime_RuntimeJniWrapper_holdMemory( // NOLINT JNIEnv* env, jclass, jlong ctxHandle) { @@ -289,7 +289,7 @@ JNIEXPORT void JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_holdMemory( JNI_METHOD_END() } -JNIEXPORT void JNICALL Java_org_apache_gluten_exec_RuntimeJniWrapper_releaseRuntime( // NOLINT +JNIEXPORT void JNICALL Java_org_apache_gluten_runtime_RuntimeJniWrapper_releaseRuntime( // NOLINT JNIEnv* env, jclass, jlong ctxHandle) { @@ -534,19 +534,8 @@ Java_org_apache_gluten_vectorized_NativeColumnarToRowJniWrapper_nativeColumnarTo auto& conf = ctx->getConfMap(); int64_t column2RowMemThreshold; auto it = conf.find(kColumnarToRowMemoryThreshold); - bool confIsLegal = - ((it == conf.end()) ? false : std::all_of(it->second.begin(), it->second.end(), [](unsigned char c) { - return std::isdigit(c); - })); - if (confIsLegal) { - column2RowMemThreshold = std::stoll(it->second); - } else { - LOG(INFO) - << "Because the spark.gluten.sql.columnarToRowMemoryThreshold configuration item is invalid, the kColumnarToRowMemoryDefaultThreshold default value is used, which is " - << kColumnarToRowMemoryDefaultThreshold << " byte"; - column2RowMemThreshold = std::stoll(kColumnarToRowMemoryDefaultThreshold); - } - + GLUTEN_CHECK(!(it == conf.end()), "Required key not found in runtime config: " + kColumnarToRowMemoryThreshold); + column2RowMemThreshold = std::stoll(it->second); // Convert the native batch to Spark unsafe row. 
return ctx->saveObject(ctx->createColumnar2RowConverter(column2RowMemThreshold)); JNI_METHOD_END(kInvalidObjectHandle) @@ -975,6 +964,7 @@ JNIEXPORT jobject JNICALL Java_org_apache_gluten_vectorized_ShuffleWriterJniWrap shuffleWriter->totalC2RTime(), shuffleWriter->totalBytesWritten(), shuffleWriter->totalBytesEvicted(), + shuffleWriter->totalBytesToEvict(), shuffleWriter->peakBytesAllocated(), partitionLengthArr, rawPartitionLengthArr); diff --git a/cpp/core/shuffle/LocalPartitionWriter.cc b/cpp/core/shuffle/LocalPartitionWriter.cc index fc5d758f8c8b2..fe206b488cf8f 100644 --- a/cpp/core/shuffle/LocalPartitionWriter.cc +++ b/cpp/core/shuffle/LocalPartitionWriter.cc @@ -183,12 +183,15 @@ class LocalPartitionWriter::PayloadMerger { return merged; } - arrow::Result>> finishForSpill(uint32_t partitionId) { + arrow::Result>> finishForSpill( + uint32_t partitionId, + int64_t& totalBytesToEvict) { // We need to check whether the spill source is from compressing/copying the merged buffers. if ((partitionInMerge_.has_value() && *partitionInMerge_ == partitionId) || !hasMerged(partitionId)) { return std::nullopt; } auto payload = std::move(partitionMergePayload_[partitionId]); + totalBytesToEvict += payload->rawSize(); return payload->toBlockPayload(Payload::kUncompressed, pool_, codec_); } @@ -312,7 +315,8 @@ class LocalPartitionWriter::PayloadCache { std::shared_ptr os, const std::string& spillFile, arrow::MemoryPool* pool, - arrow::util::Codec* codec) { + arrow::util::Codec* codec, + int64_t& totalBytesToEvict) { std::shared_ptr diskSpill = nullptr; ARROW_ASSIGN_OR_RAISE(auto start, os->Tell()); for (uint32_t pid = 0; pid < numPartitions_; ++pid) { @@ -321,6 +325,7 @@ class LocalPartitionWriter::PayloadCache { while (!payloads.empty()) { auto payload = std::move(payloads.front()); payloads.pop_front(); + totalBytesToEvict += payload->rawSize(); // Spill the cached payload to disk. RETURN_NOT_OK(payload->serialize(os.get())); compressTime_ += payload->getCompressTime(); @@ -550,7 +555,7 @@ arrow::Status LocalPartitionWriter::evict( bool reuseBuffers, bool hasComplexType, bool isFinal) { - rawPartitionLengths_[partitionId] += inMemoryPayload->getBufferSize(); + rawPartitionLengths_[partitionId] += inMemoryPayload->rawSize(); if (evictType == Evict::kSortSpill) { if (lastEvictPid_ != -1 && (partitionId < lastEvictPid_ || (isFinal && !dataFileOs_))) { @@ -604,16 +609,29 @@ arrow::Status LocalPartitionWriter::evict( return arrow::Status::OK(); } +// FIXME: Remove this code path for local partition writer. 
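// Reworked flow of the overload below: for non-final calls the payload is handed straight to the
// spiller; on the final call (stop == true), any earlier spills are first merged per partition
// (from lastEvictPid_ + 1 up to partitionId) and each partition's length is recorded as the delta
// of totalBytesEvicted_ across mergeSpills(), before the final payload itself is spilled.
// lastEvictPid_ is additionally reset to -1 whenever an out-of-order partition id forces
// finishSpill(true).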
arrow::Status LocalPartitionWriter::evict(uint32_t partitionId, std::unique_ptr blockPayload, bool stop) { rawPartitionLengths_[partitionId] += blockPayload->rawSize(); if (lastEvictPid_ != -1 && partitionId < lastEvictPid_) { RETURN_NOT_OK(finishSpill(true)); + lastEvictPid_ = -1; } - lastEvictPid_ = partitionId; - RETURN_NOT_OK(requestSpill(stop)); - RETURN_NOT_OK(spiller_->spill(partitionId, std::move(blockPayload))); + + if (!stop) { + RETURN_NOT_OK(spiller_->spill(partitionId, std::move(blockPayload))); + } else { + if (spills_.size() > 0) { + for (auto pid = lastEvictPid_ + 1; pid <= partitionId; ++pid) { + auto bytesEvicted = totalBytesEvicted_; + RETURN_NOT_OK(mergeSpills(pid)); + partitionLengths_[pid] = totalBytesEvicted_ - bytesEvicted; + } + } + RETURN_NOT_OK(spiller_->spill(partitionId, std::move(blockPayload))); + } + lastEvictPid_ = partitionId; return arrow::Status::OK(); } @@ -630,7 +648,8 @@ arrow::Status LocalPartitionWriter::reclaimFixedSize(int64_t size, int64_t* actu ARROW_ASSIGN_OR_RAISE(auto os, arrow::io::BufferedOutputStream::Create(16384, pool_, raw)); spills_.emplace_back(); ARROW_ASSIGN_OR_RAISE( - spills_.back(), payloadCache_->spillAndClose(os, spillFile, payloadPool_.get(), codec_.get())); + spills_.back(), + payloadCache_->spillAndClose(os, spillFile, payloadPool_.get(), codec_.get(), totalBytesToEvict_)); reclaimed += beforeSpill - payloadPool_->bytes_allocated(); if (reclaimed >= size) { *actual = reclaimed; @@ -641,7 +660,7 @@ arrow::Status LocalPartitionWriter::reclaimFixedSize(int64_t size, int64_t* actu if (merger_) { auto beforeSpill = payloadPool_->bytes_allocated(); for (auto pid = 0; pid < numPartitions_; ++pid) { - ARROW_ASSIGN_OR_RAISE(auto merged, merger_->finishForSpill(pid)); + ARROW_ASSIGN_OR_RAISE(auto merged, merger_->finishForSpill(pid, totalBytesToEvict_)); if (merged.has_value()) { RETURN_NOT_OK(requestSpill(false)); RETURN_NOT_OK(spiller_->spill(pid, std::move(*merged))); @@ -666,6 +685,7 @@ arrow::Status LocalPartitionWriter::populateMetrics(ShuffleWriterMetrics* metric metrics->totalCompressTime += compressTime_; metrics->totalEvictTime += spillTime_; metrics->totalWriteTime += writeTime_; + metrics->totalBytesToEvict += totalBytesToEvict_; metrics->totalBytesEvicted += totalBytesEvicted_; metrics->totalBytesWritten += std::filesystem::file_size(dataFile_); metrics->partitionLengths = std::move(partitionLengths_); diff --git a/cpp/core/shuffle/LocalPartitionWriter.h b/cpp/core/shuffle/LocalPartitionWriter.h index efd7b4df3f4f0..555632fedd5dd 100644 --- a/cpp/core/shuffle/LocalPartitionWriter.h +++ b/cpp/core/shuffle/LocalPartitionWriter.h @@ -110,6 +110,7 @@ class LocalPartitionWriter : public PartitionWriter { std::vector subDirSelection_; std::shared_ptr dataFileOs_; + int64_t totalBytesToEvict_{0}; int64_t totalBytesEvicted_{0}; std::vector partitionLengths_; std::vector rawPartitionLengths_; diff --git a/cpp/core/shuffle/Options.h b/cpp/core/shuffle/Options.h index 11fa037eb5a6f..2424ec557742b 100644 --- a/cpp/core/shuffle/Options.h +++ b/cpp/core/shuffle/Options.h @@ -87,6 +87,7 @@ struct PartitionWriterOptions { struct ShuffleWriterMetrics { int64_t totalBytesWritten{0}; int64_t totalBytesEvicted{0}; + int64_t totalBytesToEvict{0}; int64_t totalWriteTime{0}; int64_t totalEvictTime{0}; int64_t totalCompressTime{0}; diff --git a/cpp/core/shuffle/Payload.cc b/cpp/core/shuffle/Payload.cc index daeef24ce5a6d..d0c24e4bcabaf 100644 --- a/cpp/core/shuffle/Payload.cc +++ b/cpp/core/shuffle/Payload.cc @@ -293,7 +293,6 @@ arrow::Result> 
BlockPayload::readBufferAt(uint32_ arrow::Result>> BlockPayload::deserialize( arrow::io::InputStream* inputStream, - const std::shared_ptr& schema, const std::shared_ptr& codec, arrow::MemoryPool* pool, uint32_t& numRows, @@ -326,9 +325,8 @@ void BlockPayload::setCompressionTime(int64_t compressionTime) { compressTime_ = compressionTime; } -uint64_t BlockPayload::rawSize() { - return std::accumulate( - buffers_.begin(), buffers_.end(), 0UL, [](auto sum, const auto& buffer) { return sum + buffer->size(); }); +int64_t BlockPayload::rawSize() { + return getBufferSize(buffers_); } arrow::Result> InMemoryPayload::merge( @@ -419,10 +417,6 @@ arrow::Result> InMemoryPayload::readBufferAt(uint return std::move(buffers_[index]); } -int64_t InMemoryPayload::getBufferSize() const { - return gluten::getBufferSize(buffers_); -} - arrow::Status InMemoryPayload::copyBuffers(arrow::MemoryPool* pool) { for (auto& buffer : buffers_) { if (!buffer) { @@ -439,9 +433,8 @@ arrow::Status InMemoryPayload::copyBuffers(arrow::MemoryPool* pool) { return arrow::Status::OK(); } -uint64_t InMemoryPayload::rawSize() { - return std::accumulate( - buffers_.begin(), buffers_.end(), 0UL, [](auto sum, const auto& buffer) { return sum + buffer->size(); }); +int64_t InMemoryPayload::rawSize() { + return getBufferSize(buffers_); } UncompressedDiskBlockPayload::UncompressedDiskBlockPayload( @@ -514,7 +507,7 @@ arrow::Result> UncompressedDiskBlockPayload::read return buffer; } -uint64_t UncompressedDiskBlockPayload::rawSize() { +int64_t UncompressedDiskBlockPayload::rawSize() { return rawSize_; } @@ -522,7 +515,7 @@ CompressedDiskBlockPayload::CompressedDiskBlockPayload( uint32_t numRows, const std::vector* isValidityBuffer, arrow::io::InputStream*& inputStream, - uint64_t rawSize, + int64_t rawSize, arrow::MemoryPool* /* pool */) : Payload(Type::kCompressed, numRows, isValidityBuffer), inputStream_(inputStream), rawSize_(rawSize) {} @@ -537,7 +530,7 @@ arrow::Result> CompressedDiskBlockPayload::readBu return arrow::Status::Invalid("Cannot read buffer from CompressedDiskBlockPayload."); } -uint64_t CompressedDiskBlockPayload::rawSize() { +int64_t CompressedDiskBlockPayload::rawSize() { return rawSize_; } } // namespace gluten diff --git a/cpp/core/shuffle/Payload.h b/cpp/core/shuffle/Payload.h index 4c53065a6ed94..1bd8815a4c2a7 100644 --- a/cpp/core/shuffle/Payload.h +++ b/cpp/core/shuffle/Payload.h @@ -38,7 +38,7 @@ class Payload { virtual arrow::Result> readBufferAt(uint32_t index) = 0; - virtual uint64_t rawSize() = 0; + virtual int64_t rawSize() = 0; int64_t getCompressTime() const { return compressTime_; @@ -88,7 +88,6 @@ class BlockPayload final : public Payload { static arrow::Result>> deserialize( arrow::io::InputStream* inputStream, - const std::shared_ptr& schema, const std::shared_ptr& codec, arrow::MemoryPool* pool, uint32_t& numRows, @@ -98,7 +97,7 @@ class BlockPayload final : public Payload { arrow::Result> readBufferAt(uint32_t pos) override; - uint64_t rawSize() override; + int64_t rawSize() override; protected: BlockPayload( @@ -135,11 +134,9 @@ class InMemoryPayload final : public Payload { arrow::Result> toBlockPayload(Payload::Type payloadType, arrow::MemoryPool* pool, arrow::util::Codec* codec); - int64_t getBufferSize() const; - arrow::Status copyBuffers(arrow::MemoryPool* pool); - uint64_t rawSize() override; + int64_t rawSize() override; private: std::vector> buffers_; @@ -160,11 +157,11 @@ class UncompressedDiskBlockPayload final : public Payload { arrow::Status serialize(arrow::io::OutputStream* 
outputStream) override; - uint64_t rawSize() override; + int64_t rawSize() override; private: arrow::io::InputStream*& inputStream_; - uint64_t rawSize_; + int64_t rawSize_; arrow::MemoryPool* pool_; arrow::util::Codec* codec_; uint32_t readPos_{0}; @@ -178,17 +175,17 @@ class CompressedDiskBlockPayload final : public Payload { uint32_t numRows, const std::vector* isValidityBuffer, arrow::io::InputStream*& inputStream, - uint64_t rawSize, + int64_t rawSize, arrow::MemoryPool* pool); arrow::Status serialize(arrow::io::OutputStream* outputStream) override; arrow::Result> readBufferAt(uint32_t index) override; - uint64_t rawSize() override; + int64_t rawSize() override; private: arrow::io::InputStream*& inputStream_; - uint64_t rawSize_; + int64_t rawSize_; }; } // namespace gluten diff --git a/cpp/core/shuffle/ShuffleWriter.cc b/cpp/core/shuffle/ShuffleWriter.cc index e637d37ffdd83..3eff9a2c821ef 100644 --- a/cpp/core/shuffle/ShuffleWriter.cc +++ b/cpp/core/shuffle/ShuffleWriter.cc @@ -55,6 +55,10 @@ int64_t ShuffleWriter::totalBytesEvicted() const { return metrics_.totalBytesEvicted; } +int64_t ShuffleWriter::totalBytesToEvict() const { + return metrics_.totalBytesToEvict; +} + int64_t ShuffleWriter::totalWriteTime() const { return metrics_.totalWriteTime; } diff --git a/cpp/core/shuffle/ShuffleWriter.h b/cpp/core/shuffle/ShuffleWriter.h index 661112150297c..8c79829e00a4d 100644 --- a/cpp/core/shuffle/ShuffleWriter.h +++ b/cpp/core/shuffle/ShuffleWriter.h @@ -52,6 +52,8 @@ class ShuffleWriter : public Reclaimable { int64_t totalBytesEvicted() const; + int64_t totalBytesToEvict() const; + int64_t totalWriteTime() const; int64_t totalEvictTime() const; diff --git a/cpp/core/shuffle/Spill.cc b/cpp/core/shuffle/Spill.cc index 0bbe667ab4d87..d8b9bc7ebf997 100644 --- a/cpp/core/shuffle/Spill.cc +++ b/cpp/core/shuffle/Spill.cc @@ -48,7 +48,7 @@ void Spill::insertPayload( Payload::Type payloadType, uint32_t numRows, const std::vector* isValidityBuffer, - uint64_t rawSize, + int64_t rawSize, arrow::MemoryPool* pool, arrow::util::Codec* codec) { // TODO: Add compression threshold. diff --git a/cpp/core/shuffle/Spill.h b/cpp/core/shuffle/Spill.h index 7ee60ef299fe5..9d8d240879f90 100644 --- a/cpp/core/shuffle/Spill.h +++ b/cpp/core/shuffle/Spill.h @@ -46,7 +46,7 @@ class Spill final { Payload::Type payloadType, uint32_t numRows, const std::vector* isValidityBuffer, - uint64_t rawSize, + int64_t rawSize, arrow::MemoryPool* pool, arrow::util::Codec* codec); diff --git a/cpp/core/shuffle/rss/RssPartitionWriter.cc b/cpp/core/shuffle/rss/RssPartitionWriter.cc index 19f178a2cbc79..8f75f999335fe 100644 --- a/cpp/core/shuffle/rss/RssPartitionWriter.cc +++ b/cpp/core/shuffle/rss/RssPartitionWriter.cc @@ -56,7 +56,7 @@ arrow::Status RssPartitionWriter::evict( bool reuseBuffers, bool hasComplexType, bool isFinal) { - rawPartitionLengths_[partitionId] += inMemoryPayload->getBufferSize(); + rawPartitionLengths_[partitionId] += inMemoryPayload->rawSize(); auto payloadType = codec_ ? Payload::Type::kCompressed : Payload::Type::kUncompressed; ARROW_ASSIGN_OR_RAISE( auto payload, inMemoryPayload->toBlockPayload(payloadType, payloadPool_.get(), codec_ ? 
codec_.get() : nullptr)); diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index f1aa96277cb85..1952651338154 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -91,343 +91,24 @@ endif() set(VELOX_COMPONENTS_PATH "${VELOX_BUILD_PATH}/velox") -function(ADD_VELOX_DEPENDENCY VELOX_DEP_LIB_NAME VELOX_DEP_LIB_PATH) - if(NOT EXISTS ${VELOX_DEP_LIB_PATH}) - message(FATAL_ERROR "Velox library not exists: ${VELOX_DEP_LIB_PATH}") +function(import_library TARGET_NAME LIB_PATH) + if(NOT EXISTS ${LIB_PATH}) + message(FATAL_ERROR "Library does not exist: ${LIB_PATH}") endif() - set(VELOX_DEP_LIB facebook::velox::${VELOX_DEP_LIB_NAME}) - add_library(${VELOX_DEP_LIB} STATIC IMPORTED) - set_target_properties(${VELOX_DEP_LIB} PROPERTIES IMPORTED_LOCATION - ${VELOX_DEP_LIB_PATH}) - target_link_libraries(velox PUBLIC ${VELOX_DEP_LIB}) + add_library(${TARGET_NAME} STATIC IMPORTED) + set_target_properties(${TARGET_NAME} PROPERTIES IMPORTED_LOCATION ${LIB_PATH}) endfunction() -macro(ADD_VELOX_OBJECTS) - add_library(velox_objects OBJECT IMPORTED GLOBAL) - set_property( - TARGET velox_objects - PROPERTY - IMPORTED_OBJECTS - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/FileHandle.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConfig.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConnector.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveDataSink.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveDataSource.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HivePartitionUtil.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/PartitionIdGenerator.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/SplitReader.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/TableHandle.cpp.o" - "${VELOX_COMPONENTS_PATH}/connectors/hive/CMakeFiles/velox_hive_connector.dir/HiveConnectorUtil.cpp.o" - ) - target_link_libraries(velox PUBLIC velox_objects) -endmacro() - macro(add_duckdb) find_package(DuckDB) if(NOT DuckDB_FOUND) message(FATAL_ERROR "Cannot find DuckDB.") else() message(STATUS "Found DuckDB library from ${DuckDB_DIR}") - target_link_libraries(velox PUBLIC duckdb_static) + target_link_libraries(facebook::velox INTERFACE duckdb_static) endif() endmacro() -macro(ADD_VELOX_DEPENDENCIES) - add_velox_objects() - add_velox_dependency( - expression::sigparser - "${VELOX_COMPONENTS_PATH}/expression/signature_parser/libvelox_signature_parser.a" - ) - add_velox_dependency( - functions::sparksql::lib - "${VELOX_COMPONENTS_PATH}/functions/sparksql/libvelox_functions_spark.a") - add_velox_dependency( - functions::sparksql::agg - "${VELOX_COMPONENTS_PATH}/functions/sparksql/aggregates/libvelox_functions_spark_aggregates.a" - ) - add_velox_dependency( - functions::window::sparksql - "${VELOX_COMPONENTS_PATH}/functions/sparksql/window/libvelox_functions_spark_window.a" - ) - add_velox_dependency( - functions::prestosql::agg - "${VELOX_COMPONENTS_PATH}/functions/prestosql/aggregates/libvelox_aggregates.a" - ) - add_velox_dependency( - functions::lib::agg - "${VELOX_COMPONENTS_PATH}/functions/lib/aggregates/libvelox_functions_aggregates.a" - ) - add_velox_dependency( - functions::prestosql::window - 
"${VELOX_COMPONENTS_PATH}/functions/prestosql/window/libvelox_window.a") - add_velox_dependency( - functions::lib::window - "${VELOX_COMPONENTS_PATH}/functions/lib/window/libvelox_functions_window.a") - add_velox_dependency(velox::buffer - "${VELOX_COMPONENTS_PATH}/buffer/libvelox_buffer.a") - - add_velox_dependency( - functions::isnull - "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_is_null_functions.a") - add_velox_dependency( - functions::prestosql - "${VELOX_COMPONENTS_PATH}/functions/prestosql/registration/libvelox_functions_prestosql.a" - ) - add_velox_dependency( - functions::prestosql::impl - "${VELOX_COMPONENTS_PATH}/functions/prestosql/libvelox_functions_prestosql_impl.a" - ) - add_velox_dependency( - functions::json - "${VELOX_COMPONENTS_PATH}/functions/prestosql/json/libvelox_functions_json.a" - ) - add_velox_dependency( - functions::hyperloglog - "${VELOX_COMPONENTS_PATH}/common/hyperloglog/libvelox_common_hyperloglog.a") - add_velox_dependency( - functions::lib - "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_functions_lib.a") - add_velox_dependency( - functions::lib::date_time_formatter - "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_functions_lib_date_time_formatter.a" - ) - if(BUILD_TESTS) - add_velox_dependency( - exec::test - "${VELOX_COMPONENTS_PATH}/exec/tests/utils/libvelox_exec_test_lib.a") - add_velox_dependency( - temp::path - "${VELOX_COMPONENTS_PATH}/exec/tests/utils/libvelox_temp_path.a") - add_velox_dependency( - dwio::common::test::utils - "${VELOX_COMPONENTS_PATH}/dwio/common/tests/utils/libvelox_dwio_common_test_utils.a" - ) - endif() - add_velox_dependency(exec "${VELOX_COMPONENTS_PATH}/exec/libvelox_exec.a") - - if(BUILD_TESTS) - add_velox_dependency( - parse::parser "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_parser.a") - add_velox_dependency( - duckdb::parser - "${VELOX_COMPONENTS_PATH}/duckdb/conversion/libvelox_duckdb_parser.a") - add_velox_dependency( - parse::expression - "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_expression.a") - add_velox_dependency( - parse::utils "${VELOX_COMPONENTS_PATH}/parse/libvelox_parse_utils.a") - add_velox_dependency( - function::registry - "${VELOX_COMPONENTS_PATH}/functions/libvelox_function_registry.a") - endif() - add_velox_dependency( - vector::arrow::bridge - "${VELOX_COMPONENTS_PATH}/vector/arrow/libvelox_arrow_bridge.a") - add_velox_dependency(row "${VELOX_COMPONENTS_PATH}/row/libvelox_row_fast.a") - add_velox_dependency( - connector "${VELOX_COMPONENTS_PATH}/connectors/libvelox_connector.a") - add_velox_dependency( - connector::hive_parition - "${VELOX_COMPONENTS_PATH}/connectors/hive/libvelox_hive_partition_function.a" - ) - add_velox_dependency( - connector::hive::iceberg::IcebergSplitReader - "${VELOX_COMPONENTS_PATH}/connectors/hive/iceberg/libvelox_hive_iceberg_splitreader.a" - ) - add_velox_dependency( - connector::hive::hdfs - "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/hdfs/libvelox_hdfs.a" - ) - add_velox_dependency( - connector::hive::gcs - "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/gcs/libvelox_gcs.a" - ) - add_velox_dependency( - connector::hive::s3fs - "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/s3fs/libvelox_s3fs.a" - ) - add_velox_dependency( - connector::hive::abfs - "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/abfs/libvelox_abfs.a" - ) - add_velox_dependency( - dwio::dwrf::writer - "${VELOX_COMPONENTS_PATH}/dwio/dwrf/writer/libvelox_dwio_dwrf_writer.a") - add_velox_dependency( - dwio::dwrf::reader - 
"${VELOX_COMPONENTS_PATH}/dwio/dwrf/reader/libvelox_dwio_dwrf_reader.a") - add_velox_dependency( - dwio::orc::reader - "${VELOX_COMPONENTS_PATH}/dwio/orc/reader/libvelox_dwio_orc_reader.a") - add_velox_dependency( - dwio::dwrf::utils - "${VELOX_COMPONENTS_PATH}/dwio/dwrf/utils/libvelox_dwio_dwrf_utils.a") - add_velox_dependency( - dwio::dwrf::common - "${VELOX_COMPONENTS_PATH}/dwio/dwrf/common/libvelox_dwio_dwrf_common.a") - add_velox_dependency( - parquet - "${VELOX_COMPONENTS_PATH}/dwio/parquet/libvelox_dwio_parquet_reader.a") - add_velox_dependency( - parquet::reader::native - "${VELOX_COMPONENTS_PATH}/dwio/parquet/reader/libvelox_dwio_native_parquet_reader.a" - ) - if(BUILD_TESTS) - add_velox_dependency( - dwio::common::utils - "${VELOX_COMPONENTS_PATH}/dwio/common/tests/utils/libvelox_dwio_common_test_utils.a" - ) - add_velox_dependency( - dwio::dwrf::test_utils - "${VELOX_COMPONENTS_PATH}/dwio/dwrf/test/utils/libvelox_dwrf_test_utils.a" - ) - add_velox_dependency( - parquet::reader::duckdb_conversion - "${VELOX_COMPONENTS_PATH}/duckdb/conversion/libvelox_duckdb_conversion.a") - - add_duckdb() - - add_velox_dependency( - tpch::gen "${VELOX_COMPONENTS_PATH}/tpch/gen/libvelox_tpch_gen.a") - add_velox_dependency(dbgen - "${VELOX_COMPONENTS_PATH}/tpch/gen/dbgen/libdbgen.a") - endif() - - add_velox_dependency( - parquet::reader::thrift - "${VELOX_COMPONENTS_PATH}/dwio/parquet/thrift/libvelox_dwio_parquet_thrift.a" - ) - - add_velox_dependency( - velox::arrow::parquet::writer - "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/libvelox_dwio_arrow_parquet_writer.a" - ) - add_velox_dependency( - dwio::arrow::parquet::writer - "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/libvelox_dwio_arrow_parquet_writer_lib.a" - ) - add_velox_dependency( - dwio::arrow::parquet::writer::util - "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/util/libvelox_dwio_arrow_parquet_writer_util_lib.a" - ) - add_velox_dependency( - dwio::arrow::parquet::writer::thrift::lib - "${VELOX_COMPONENTS_PATH}/dwio/parquet/writer/arrow/generated/libvelox_dwio_arrow_parquet_writer_thrift_lib.a" - ) - add_velox_dependency( - dwio::common::compression - "${VELOX_COMPONENTS_PATH}/dwio/common/compression/libvelox_dwio_common_compression.a" - ) - add_velox_dependency( - dwio::common "${VELOX_COMPONENTS_PATH}/dwio/common/libvelox_dwio_common.a") - add_velox_dependency( - functions::prestosql::types - "${VELOX_COMPONENTS_PATH}/functions/prestosql/types/libvelox_presto_types.a" - ) - add_velox_dependency( - functions::spark::specialforms - "${VELOX_COMPONENTS_PATH}/functions/sparksql/specialforms/libvelox_functions_spark_specialforms.a" - ) - add_velox_dependency( - expression "${VELOX_COMPONENTS_PATH}/expression/libvelox_expression.a") - add_velox_dependency(core "${VELOX_COMPONENTS_PATH}/core/libvelox_core.a") - - add_velox_dependency( - type::fbhive "${VELOX_COMPONENTS_PATH}/type/fbhive/libvelox_type_fbhive.a") - add_velox_dependency(type "${VELOX_COMPONENTS_PATH}/type/libvelox_type.a") - add_velox_dependency( - vector::serializes - "${VELOX_COMPONENTS_PATH}/serializers/libvelox_presto_serializer.a") - add_velox_dependency( - functions::lib::util - "${VELOX_COMPONENTS_PATH}/functions/lib/libvelox_functions_util.a") - add_velox_dependency(vector - "${VELOX_COMPONENTS_PATH}/vector/libvelox_vector.a") - add_velox_dependency( - expression::function - "${VELOX_COMPONENTS_PATH}/expression/libvelox_expression_functions.a") - add_velox_dependency( - expression::type_calculation - 
"${VELOX_COMPONENTS_PATH}/expression/type_calculation/libvelox_type_calculation.a" - ) - - add_velox_dependency( - common::caching - "${VELOX_COMPONENTS_PATH}/common/caching/libvelox_caching.a") - add_velox_dependency( - common::base "${VELOX_COMPONENTS_PATH}/common/base/libvelox_common_base.a") - add_velox_dependency( - common::memory "${VELOX_COMPONENTS_PATH}/common/memory/libvelox_memory.a") - add_velox_dependency( - common::serialization - "${VELOX_COMPONENTS_PATH}/common/serialization/libvelox_serialization.a") - add_velox_dependency( - common::base::exception - "${VELOX_COMPONENTS_PATH}/common/base/libvelox_exception.a") - - add_velox_dependency(type::tz - "${VELOX_COMPONENTS_PATH}/type/tz/libvelox_type_tz.a") - add_velox_dependency( - dwio::dwrf::proto - "${VELOX_COMPONENTS_PATH}/dwio/dwrf/proto/libvelox_dwio_dwrf_proto.a") - add_velox_dependency( - dwio::catalog::fbhive - "${VELOX_COMPONENTS_PATH}/dwio/catalog/fbhive/libvelox_dwio_catalog_fbhive.a" - ) - add_velox_dependency( - dwio::common::exception - "${VELOX_COMPONENTS_PATH}/dwio/common/exception/libvelox_dwio_common_exception.a" - ) - add_velox_dependency( - dwio::common::encryption - "${VELOX_COMPONENTS_PATH}/dwio/common/encryption/libvelox_dwio_common_encryption.a" - ) - - add_velox_dependency(core::config - "${VELOX_COMPONENTS_PATH}/core/libvelox_config.a") - add_velox_dependency( - common::encode "${VELOX_COMPONENTS_PATH}/common/encode/libvelox_encode.a") - add_velox_dependency(common::time - "${VELOX_COMPONENTS_PATH}/common/time/libvelox_time.a") - if(BUILD_TESTS) - add_velox_dependency( - common::file::test - "${VELOX_COMPONENTS_PATH}/common/file/tests/libvelox_file_test_utils.a") - endif() - add_velox_dependency(common::file - "${VELOX_COMPONENTS_PATH}/common/file/libvelox_file.a") - add_velox_dependency( - common::process - "${VELOX_COMPONENTS_PATH}/common/process/libvelox_process.a") - - add_velox_dependency( - common::test_util - "${VELOX_COMPONENTS_PATH}/common/testutil/libvelox_test_util.a") - - add_velox_dependency( - external::md5 - "${VELOX_COMPONENTS_PATH}/external/md5/libvelox_external_md5.a") - add_velox_dependency( - external::date - "${VELOX_COMPONENTS_PATH}/external/date/libvelox_external_date.a") - add_velox_dependency( - velox::parquet::writer - "${VELOX_COMPONENTS_PATH}/dwio/parquet/libvelox_dwio_parquet_writer.a") - - if(BUILD_TESTS) - add_velox_dependency( - vector::test::util - "${VELOX_COMPONENTS_PATH}/vector/tests/utils/libvelox_vector_test_lib.a") - endif() - add_velox_dependency( - common::compression - "${VELOX_COMPONENTS_PATH}/common/compression/libvelox_common_compression.a") - add_velox_dependency( - common::io "${VELOX_COMPONENTS_PATH}/common/io/libvelox_common_io.a") - add_velox_dependency(velox::status - "${VELOX_COMPONENTS_PATH}/common/base/libvelox_status.a") -endmacro() - macro(find_libhdfs3) find_package(libhdfs3 CONFIG) if(libhdfs3_FOUND AND TARGET HDFS::hdfs3) @@ -582,20 +263,51 @@ if(BUILD_JEMALLOC) endif() target_link_libraries(velox PUBLIC gluten) -add_velox_dependencies() + +# Requires VELOX_MONO_LIBRARY=ON when building Velox. 
+import_library(facebook::velox ${VELOX_BUILD_PATH}/lib/libvelox.a) + +if(BUILD_TESTS) + add_duckdb() + + import_library(facebook::velox::dbgen + ${VELOX_BUILD_PATH}/velox/tpch/gen/dbgen/libdbgen.a) + target_link_libraries(facebook::velox INTERFACE facebook::velox::dbgen) + + import_library( + facebook::velox::vector_test_lib + ${VELOX_BUILD_PATH}/velox/vector/tests/utils/libvelox_vector_test_lib.a) + import_library( + facebook::velox::dwio_common_test + ${VELOX_BUILD_PATH}/velox/dwio/common/tests/utils/libvelox_dwio_common_test_utils.a + ) + import_library( + facebook::velox::file_test_utils + ${VELOX_BUILD_PATH}/velox/common/file/tests/libvelox_file_test_utils.a) + import_library( + facebook::velox::temp_path + ${VELOX_BUILD_PATH}/velox/exec/tests/utils/libvelox_temp_path.a) + import_library( + facebook::velox::exec_test_lib + ${VELOX_COMPONENTS_PATH}/exec/tests/utils/libvelox_exec_test_lib.a) + target_link_libraries( + facebook::velox::exec_test_lib + INTERFACE facebook::velox::vector_test_lib + facebook::velox::dwio_common_test + facebook::velox::file_test_utils facebook::velox::temp_path) + target_link_libraries(velox PUBLIC facebook::velox::exec_test_lib) +endif() + +target_link_libraries(velox PUBLIC facebook::velox) target_link_libraries(velox PUBLIC Folly::folly) + find_re2() target_link_libraries(velox PUBLIC ${RE2_LIBRARY}) -# since -# https://github.com/facebookincubator/velox/commit/47970417ac92135e862c0fde350d4d60fa2f1423 -if(Stemmer_FOUND) - target_link_libraries(velox PUBLIC stemmer::stemmer) -else() - add_velox_dependency( - velox "${VELOX_BUILD_PATH}/_deps/libstemmer/src/libstemmer/libstemmer.a") -endif() +import_library(external::stemmer + ${VELOX_BUILD_PATH}/_deps/libstemmer/src/libstemmer/libstemmer.a) +target_link_libraries(velox PUBLIC external::stemmer) set(CMAKE_FIND_LIBRARY_SUFFIXES_BCK ${CMAKE_FIND_LIBRARY_SUFFIXES}) set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") @@ -603,8 +315,9 @@ find_package(simdjson CONFIG) if(simdjson_FOUND AND TARGET simdjson::simdjson) target_link_libraries(velox PUBLIC simdjson::simdjson) else() - add_velox_dependency(external::simdjson - "${VELOX_BUILD_PATH}/_deps/simdjson-build/libsimdjson.a") + import_library(external::simdjson + ${VELOX_BUILD_PATH}/_deps/simdjson-build/libsimdjson.a) + target_link_libraries(velox PUBLIC external::simdjson) endif() set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_BCK}) @@ -634,10 +347,6 @@ if(ENABLE_GCS) target_link_libraries(velox PUBLIC google-cloud-cpp::storage) endif() -if(BUILD_EXAMPLES) - add_subdirectory(udf/examples) -endif() - if(ENABLE_ABFS) add_definitions(-DENABLE_ABFS) find_azure() @@ -645,6 +354,10 @@ if(ENABLE_ABFS) target_link_libraries(velox PUBLIC Azure::azure-storage-files-datalake) endif() +if(BUILD_EXAMPLES) + add_subdirectory(udf/examples) +endif() + add_custom_command( TARGET velox POST_BUILD diff --git a/cpp/velox/benchmarks/GenericBenchmark.cc b/cpp/velox/benchmarks/GenericBenchmark.cc index a7776f9f1074b..4b46dbd599411 100644 --- a/cpp/velox/benchmarks/GenericBenchmark.cc +++ b/cpp/velox/benchmarks/GenericBenchmark.cc @@ -30,6 +30,7 @@ #include "compute/VeloxPlanConverter.h" #include "compute/VeloxRuntime.h" #include "config/GlutenConfig.h" +#include "config/VeloxConfig.h" #include "shuffle/LocalPartitionWriter.h" #include "shuffle/VeloxShuffleWriter.h" #include "shuffle/rss/RssPartitionWriter.h" @@ -44,22 +45,23 @@ using namespace gluten; namespace { +DEFINE_bool(run_example, false, "Run the example and exit."); DEFINE_bool(print_result, true, "Print result for 
execution"); DEFINE_string(save_output, "", "Path to parquet file for saving the task output iterator"); DEFINE_bool(with_shuffle, false, "Add shuffle split at end."); +DEFINE_bool(run_shuffle, false, "Only run shuffle write."); +DEFINE_bool(run_shuffle_read, false, "Whether to run shuffle read when run_shuffle is true."); +DEFINE_string(shuffle_writer, "hash", "Shuffle writer type. Can be hash or sort"); DEFINE_string( partitioning, "rr", "Short partitioning name. Valid options are rr, hash, range, single, random (only for test purpose)"); -DEFINE_string(shuffle_writer, "hash", "Shuffle writer type. Can be hash or sort"); DEFINE_bool(rss, false, "Mocking rss."); DEFINE_string( compression, "lz4", "Specify the compression codec. Valid options are lz4, zstd, qat_gzip, qat_zstd, iaa_gzip"); DEFINE_int32(shuffle_partitions, 200, "Number of shuffle split (reducer) partitions"); -DEFINE_bool(run_shuffle, false, "Only run shuffle write."); -DEFINE_bool(run_example, false, "Run the example and exit."); DEFINE_string(plan, "", "Path to input json file of the substrait plan."); DEFINE_string( @@ -76,15 +78,21 @@ DEFINE_string( "Scan mode for reading parquet data." "'stream' mode: Input file scan happens inside of the pipeline." "'buffered' mode: First read all data into memory and feed the pipeline with it."); +DEFINE_bool(debug_mode, false, "Whether to enable debug mode. Same as setting `spark.gluten.sql.debug`"); struct WriterMetrics { - int64_t splitTime; - int64_t evictTime; - int64_t writeTime; - int64_t compressTime; + int64_t splitTime{0}; + int64_t evictTime{0}; + int64_t writeTime{0}; + int64_t compressTime{0}; + + int64_t bytesSpilled{0}; + int64_t bytesWritten{0}; +}; - public: - explicit WriterMetrics() : splitTime(0), evictTime(0), writeTime(0), compressTime(0) {} +struct ReaderMetrics { + int64_t decompressTime{0}; + int64_t deserializeTime{0}; }; void setUpBenchmark(::benchmark::internal::Benchmark* bm) { @@ -98,9 +106,10 @@ void setUpBenchmark(::benchmark::internal::Benchmark* bm) { } } -std::shared_ptr -createShuffleWriter(Runtime* runtime, const std::string& dataFile, const std::vector& localDirs) { +PartitionWriterOptions createPartitionWriterOptions() { PartitionWriterOptions partitionWriterOptions{}; + // Disable writer's merge. + partitionWriterOptions.mergeThreshold = 0; // Configure compression. 
if (FLAGS_compression == "lz4") { @@ -121,27 +130,39 @@ createShuffleWriter(Runtime* runtime, const std::string& dataFile, const std::ve partitionWriterOptions.codecBackend = CodecBackend::IAA; partitionWriterOptions.compressionType = arrow::Compression::GZIP; } + return partitionWriterOptions; +} +std::unique_ptr createPartitionWriter( + Runtime* runtime, + PartitionWriterOptions options, + const std::string& dataFile, + const std::vector& localDirs) { std::unique_ptr partitionWriter; if (FLAGS_rss) { auto rssClient = std::make_unique(dataFile); partitionWriter = std::make_unique( FLAGS_shuffle_partitions, - std::move(partitionWriterOptions), + std::move(options), runtime->memoryManager()->getArrowMemoryPool(), std::move(rssClient)); } else { partitionWriter = std::make_unique( FLAGS_shuffle_partitions, - std::move(partitionWriterOptions), + std::move(options), runtime->memoryManager()->getArrowMemoryPool(), dataFile, localDirs); } + return partitionWriter; +} +std::shared_ptr createShuffleWriter( + Runtime* runtime, + std::unique_ptr partitionWriter) { auto options = ShuffleWriterOptions{}; options.partitioning = gluten::toPartitioning(FLAGS_partitioning); - if (FLAGS_rss) { + if (FLAGS_rss || FLAGS_shuffle_writer == "rss_sort") { options.shuffleWriterType = gluten::kRssSortShuffle; } else if (FLAGS_shuffle_writer == "sort") { options.shuffleWriterType = gluten::kSortShuffle; @@ -163,6 +184,8 @@ void populateWriterMetrics( if (splitTime > 0) { metrics.splitTime += splitTime; } + metrics.bytesWritten += shuffleWriter->totalBytesWritten(); + metrics.bytesSpilled += shuffleWriter->totalBytesEvicted(); } void setCpu(::benchmark::State& state) { @@ -171,7 +194,7 @@ void setCpu(::benchmark::State& state) { if (FLAGS_cpu != -1) { cpu += FLAGS_cpu; } - LOG(INFO) << "Setting CPU for thread " << state.thread_index() << " to " << cpu; + LOG(WARNING) << "Setting CPU for thread " << state.thread_index() << " to " << cpu; gluten::setCpu(cpu); } @@ -179,26 +202,56 @@ void runShuffle( Runtime* runtime, BenchmarkAllocationListener* listener, const std::shared_ptr& resultIter, - WriterMetrics& metrics) { + WriterMetrics& writerMetrics, + ReaderMetrics& readerMetrics, + bool readAfterWrite) { std::string dataFile; std::vector localDirs; bool isFromEnv; GLUTEN_THROW_NOT_OK(setLocalDirsAndDataFileFromEnv(dataFile, localDirs, isFromEnv)); - auto shuffleWriter = createShuffleWriter(runtime, dataFile, localDirs); + auto partitionWriterOptions = createPartitionWriterOptions(); + auto partitionWriter = createPartitionWriter(runtime, partitionWriterOptions, dataFile, localDirs); + auto shuffleWriter = createShuffleWriter(runtime, std::move(partitionWriter)); listener->setShuffleWriter(shuffleWriter.get()); int64_t totalTime = 0; + std::shared_ptr cSchema; { gluten::ScopedTimer timer(&totalTime); while (resultIter->hasNext()) { - GLUTEN_THROW_NOT_OK( - shuffleWriter->write(resultIter->next(), ShuffleWriter::kMaxMemLimit - shuffleWriter->cachedPayloadSize())); + auto cb = resultIter->next(); + if (!cSchema) { + cSchema = cb->exportArrowSchema(); + } + GLUTEN_THROW_NOT_OK(shuffleWriter->write(cb, ShuffleWriter::kMaxMemLimit - shuffleWriter->cachedPayloadSize())); } GLUTEN_THROW_NOT_OK(shuffleWriter->stop()); } - populateWriterMetrics(shuffleWriter, totalTime, metrics); + populateWriterMetrics(shuffleWriter, totalTime, writerMetrics); + + if (readAfterWrite && cSchema) { + auto readerOptions = ShuffleReaderOptions{}; + readerOptions.shuffleWriterType = shuffleWriter->options().shuffleWriterType; + 
readerOptions.compressionType = partitionWriterOptions.compressionType; + readerOptions.codecBackend = partitionWriterOptions.codecBackend; + readerOptions.compressionTypeStr = partitionWriterOptions.compressionTypeStr; + + std::shared_ptr schema = + gluten::arrowGetOrThrow(arrow::ImportSchema(reinterpret_cast(cSchema.get()))); + auto reader = runtime->createShuffleReader(schema, readerOptions); + + GLUTEN_ASSIGN_OR_THROW(auto in, arrow::io::ReadableFile::Open(dataFile)); + // Read all partitions. + auto iter = reader->readStream(in); + while (iter->hasNext()) { + // Read and discard. + auto cb = iter->next(); + } + readerMetrics.decompressTime = reader->getDecompressTime(); + readerMetrics.deserializeTime = reader->getDeserializeTime(); + } // Cleanup shuffle outputs cleanupShuffleOutput(dataFile, localDirs, isFromEnv); } @@ -207,20 +260,37 @@ void updateBenchmarkMetrics( ::benchmark::State& state, const int64_t& elapsedTime, const int64_t& readInputTime, - const WriterMetrics& writerMetrics) { + const WriterMetrics& writerMetrics, + const ReaderMetrics& readerMetrics) { state.counters["read_input_time"] = benchmark::Counter(readInputTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); state.counters["elapsed_time"] = benchmark::Counter(elapsedTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["shuffle_write_time"] = benchmark::Counter( - writerMetrics.writeTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["shuffle_spill_time"] = benchmark::Counter( - writerMetrics.evictTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["shuffle_split_time"] = benchmark::Counter( - writerMetrics.splitTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); - state.counters["shuffle_compress_time"] = benchmark::Counter( - writerMetrics.compressTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + if (FLAGS_run_shuffle || FLAGS_with_shuffle) { + state.counters["shuffle_write_time"] = benchmark::Counter( + writerMetrics.writeTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + state.counters["shuffle_spill_time"] = benchmark::Counter( + writerMetrics.evictTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + state.counters["shuffle_compress_time"] = benchmark::Counter( + writerMetrics.compressTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + state.counters["shuffle_decompress_time"] = benchmark::Counter( + readerMetrics.decompressTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + state.counters["shuffle_deserialize_time"] = benchmark::Counter( + readerMetrics.deserializeTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + + auto splitTime = writerMetrics.splitTime; + if (FLAGS_scan_mode == "stream") { + splitTime -= readInputTime; + } + state.counters["shuffle_split_time"] = + benchmark::Counter(splitTime, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1000); + + state.counters["shuffle_spilled_bytes"] = benchmark::Counter( + writerMetrics.bytesSpilled, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1024); + state.counters["shuffle_write_bytes"] = benchmark::Counter( + writerMetrics.bytesWritten, benchmark::Counter::kAvgIterations, benchmark::Counter::OneK::kIs1024); + } } } // namespace @@ -246,6 +316,7 @@ auto BM_Generic = 
[](::benchmark::State& state, } WriterMetrics writerMetrics{}; + ReaderMetrics readerMetrics{}; int64_t readInputTime = 0; int64_t elapsedTime = 0; @@ -275,7 +346,7 @@ auto BM_Generic = [](::benchmark::State& state, listenerPtr->setIterator(resultIter.get()); if (FLAGS_with_shuffle) { - runShuffle(runtime, listenerPtr, resultIter, writerMetrics); + runShuffle(runtime, listenerPtr, resultIter, writerMetrics, readerMetrics, false); } else { // May write the output into file. auto veloxPlan = dynamic_cast(runtime)->getVeloxPlan(); @@ -299,7 +370,7 @@ auto BM_Generic = [](::benchmark::State& state, return; } if (FLAGS_print_result) { - LOG(INFO) << maybeBatch.ValueOrDie()->ToString(); + LOG(WARNING) << maybeBatch.ValueOrDie()->ToString(); } if (!FLAGS_save_output.empty()) { GLUTEN_THROW_NOT_OK(writer.writeInBatches(maybeBatch.ValueOrDie())); @@ -322,18 +393,18 @@ auto BM_Generic = [](::benchmark::State& state, const auto* task = rawIter->task(); const auto* planNode = rawIter->veloxPlan(); auto statsStr = facebook::velox::exec::printPlanWithStats(*planNode, task->taskStats(), true); - LOG(INFO) << statsStr; + LOG(WARNING) << statsStr; } } - updateBenchmarkMetrics(state, elapsedTime, readInputTime, writerMetrics); + updateBenchmarkMetrics(state, elapsedTime, readInputTime, writerMetrics, readerMetrics); Runtime::release(runtime); }; -auto BM_ShuffleWrite = [](::benchmark::State& state, - const std::string& inputFile, - RuntimeFactory runtimeFactory, - FileReaderType readerType) { +auto BM_ShuffleWriteRead = [](::benchmark::State& state, + const std::string& inputFile, + RuntimeFactory runtimeFactory, + FileReaderType readerType) { setCpu(state); auto listener = std::make_unique(FLAGS_memory_limit); @@ -341,31 +412,48 @@ auto BM_ShuffleWrite = [](::benchmark::State& state, auto runtime = runtimeFactory(std::move(listener)); WriterMetrics writerMetrics{}; + ReaderMetrics readerMetrics{}; int64_t readInputTime = 0; int64_t elapsedTime = 0; { ScopedTimer timer(&elapsedTime); for (auto _ : state) { auto resultIter = getInputIteratorFromFileReader(inputFile, readerType); - runShuffle(runtime, listenerPtr, resultIter, writerMetrics); + runShuffle(runtime, listenerPtr, resultIter, writerMetrics, readerMetrics, FLAGS_run_shuffle_read); auto reader = static_cast(resultIter->getInputIter()); readInputTime += reader->getCollectBatchTime(); } } - updateBenchmarkMetrics(state, elapsedTime, readInputTime, writerMetrics); + updateBenchmarkMetrics(state, elapsedTime, readInputTime, writerMetrics, readerMetrics); Runtime::release(runtime); }; int main(int argc, char** argv) { - ::benchmark::Initialize(&argc, argv); gflags::ParseCommandLineFlags(&argc, &argv, true); + std::ostringstream ss; + ss << "Setting flags from command line args: " << std::endl; + std::vector flags; + google::GetAllFlags(&flags); + auto filename = std::filesystem::path(__FILE__).filename(); + for (const auto& flag : flags) { + if (std::filesystem::path(flag.filename).filename() == filename) { + ss << " FLAGS_" << flag.name << ": default = " << flag.default_value << ", current = " << flag.current_value + << std::endl; + } + } + LOG(WARNING) << ss.str(); + + ::benchmark::Initialize(&argc, argv); + // Init Velox backend. 
auto backendConf = gluten::defaultConf(); auto sessionConf = gluten::defaultConf(); - backendConf.insert({gluten::kSparkBatchSize, std::to_string(FLAGS_batch_size)}); + backendConf.insert({gluten::kDebugModeEnabled, std::to_string(FLAGS_debug_mode)}); + backendConf.insert({gluten::kGlogVerboseLevel, std::to_string(FLAGS_v)}); + backendConf.insert({gluten::kGlogSeverityLevel, std::to_string(FLAGS_minloglevel)}); if (!FLAGS_conf.empty()) { abortIfFileNotExists(FLAGS_conf); std::ifstream file(FLAGS_conf); @@ -425,7 +513,7 @@ int main(int argc, char** argv) { std::vector dataFiles{}; if (FLAGS_run_example) { - LOG(INFO) << "Running example..."; + LOG(WARNING) << "Running example..."; dataFiles.resize(2); try { substraitJsonFile = getGeneratedFilePath("example.json"); @@ -484,33 +572,23 @@ int main(int argc, char** argv) { if (!errorMsg.empty()) { LOG(ERROR) << "Incorrect usage: " << errorMsg << std::endl - << "If simulating a first stage, the usage is:" << std::endl - << "./generic_benchmark " - << "--plan /absolute-path/to/substrait_json_file " - << "--split /absolute-path/to/split_json_file_1,/abosolute-path/to/split_json_file_2,..." - << "--data /absolute-path/to/data_file_1,/absolute-path/to/data_file_2,..." << std::endl - << "If simulating a middle stage, the usage is:" << std::endl - << "./generic_benchmark " - << "--plan /absolute-path/to/substrait_json_file " - << "--data /absolute-path/to/data_file_1,/absolute-path/to/data_file_2,..."; - LOG(ERROR) << "*** Please check docs/developers/MicroBenchmarks.md for the full usage. ***"; + << "*** Please check docs/developers/MicroBenchmarks.md for the full usage. ***"; ::benchmark::Shutdown(); std::exit(EXIT_FAILURE); } } - // Check whether input files exist. - LOG(INFO) << "Using substrait json file: " << std::endl << substraitJsonFile; + LOG(WARNING) << "Using substrait json file: " << std::endl << substraitJsonFile; if (!splitFiles.empty()) { - LOG(INFO) << "Using " << splitFiles.size() << " input split file(s): "; + LOG(WARNING) << "Using " << splitFiles.size() << " input split file(s): "; for (const auto& splitFile : splitFiles) { - LOG(INFO) << splitFile; + LOG(WARNING) << splitFile; } } if (!dataFiles.empty()) { - LOG(INFO) << "Using " << dataFiles.size() << " input data file(s): "; + LOG(WARNING) << "Using " << dataFiles.size() << " input data file(s): "; for (const auto& dataFile : dataFiles) { - LOG(INFO) << dataFile; + LOG(WARNING) << dataFile; } } @@ -528,37 +606,28 @@ int main(int argc, char** argv) { setUpBenchmark(bm); \ } while (0) -#define SHUFFLE_WRITE_BENCHMARK(READER_TYPE) \ - do { \ - auto* bm = \ - ::benchmark::RegisterBenchmark("ShuffleWrite", BM_ShuffleWrite, dataFiles[0], runtimeFactory, READER_TYPE) \ - ->MeasureProcessCPUTime() \ - ->UseRealTime(); \ - setUpBenchmark(bm); \ +#define SHUFFLE_WRITE_READ_BENCHMARK(READER_TYPE) \ + do { \ + auto* bm = ::benchmark::RegisterBenchmark( \ + "ShuffleWriteRead", BM_ShuffleWriteRead, dataFiles[0], runtimeFactory, READER_TYPE) \ + ->MeasureProcessCPUTime() \ + ->UseRealTime(); \ + setUpBenchmark(bm); \ } while (0) - LOG(INFO) << "Using options: "; - LOG(INFO) << "threads: " << FLAGS_threads; - LOG(INFO) << "iterations: " << FLAGS_iterations; - LOG(INFO) << "cpu: " << FLAGS_cpu; - LOG(INFO) << "print_result: " << FLAGS_print_result; - LOG(INFO) << "save_output: " << FLAGS_save_output; - LOG(INFO) << "batch_size: " << FLAGS_batch_size; - LOG(INFO) << "write_path: " << FLAGS_write_path; - if (dataFiles.empty()) { GENERIC_BENCHMARK(FileReaderType::kNone); } else { FileReaderType 
readerType; if (FLAGS_scan_mode == "buffered") { readerType = FileReaderType::kBuffered; - LOG(INFO) << "Using buffered mode for reading parquet data."; + LOG(WARNING) << "Using buffered mode for reading parquet data."; } else { readerType = FileReaderType::kStream; - LOG(INFO) << "Using stream mode for reading parquet data."; + LOG(WARNING) << "Using stream mode for reading parquet data."; } if (FLAGS_run_shuffle) { - SHUFFLE_WRITE_BENCHMARK(readerType); + SHUFFLE_WRITE_READ_BENCHMARK(readerType); } else { GENERIC_BENCHMARK(readerType); } diff --git a/cpp/velox/benchmarks/common/BenchmarkUtils.cc b/cpp/velox/benchmarks/common/BenchmarkUtils.cc index c3baa2f339151..345f9da8e16d1 100644 --- a/cpp/velox/benchmarks/common/BenchmarkUtils.cc +++ b/cpp/velox/benchmarks/common/BenchmarkUtils.cc @@ -159,7 +159,11 @@ setLocalDirsAndDataFileFromEnv(std::string& dataFile, std::vector& // Set local dirs. auto joinedDirs = std::string(joinedDirsC); // Split local dirs and use thread id to choose one directory for data file. - localDirs = gluten::splitPaths(joinedDirs); + auto dirs = gluten::splitPaths(joinedDirs); + for (const auto& dir : dirs) { + localDirs.push_back(arrow::fs::internal::ConcatAbstractPath(dir, "temp_shuffle_" + generateUuid())); + std::filesystem::create_directory(localDirs.back()); + } size_t id = std::hash{}(std::this_thread::get_id()) % localDirs.size(); ARROW_ASSIGN_OR_RAISE(dataFile, gluten::createTempShuffleFile(localDirs[id])); } else { diff --git a/cpp/velox/compute/VeloxBackend.cc b/cpp/velox/compute/VeloxBackend.cc index 2dad6adf2e70e..8dc3ade80dec7 100644 --- a/cpp/velox/compute/VeloxBackend.cc +++ b/cpp/velox/compute/VeloxBackend.cc @@ -45,6 +45,7 @@ DECLARE_bool(velox_exception_user_stacktrace_enabled); DECLARE_int32(velox_memory_num_shared_leaf_pools); DECLARE_bool(velox_memory_use_hugepages); +DECLARE_bool(velox_memory_pool_capacity_transfer_across_tasks); DECLARE_int32(cache_prefetch_min_pct); DECLARE_int32(gluten_velox_aysnc_timeout_on_task_stopping); @@ -63,13 +64,14 @@ gluten::Runtime* veloxRuntimeFactory( } // namespace void VeloxBackend::init(const std::unordered_map& conf) { - backendConf_ = std::make_shared(conf); + backendConf_ = + std::make_shared(std::unordered_map(conf)); // Register Velox runtime factory gluten::Runtime::registerFactory(gluten::kVeloxRuntimeKind, veloxRuntimeFactory); if (backendConf_->get(kDebugModeEnabled, false)) { - LOG(INFO) << "VeloxBackend config:" << printConfig(backendConf_->values()); + LOG(INFO) << "VeloxBackend config:" << printConfig(backendConf_->rawConfigs()); } // Init glog and log level. @@ -77,7 +79,7 @@ void VeloxBackend::init(const std::unordered_map& conf FLAGS_v = backendConf_->get(kGlogVerboseLevel, kGlogVerboseLevelDefault); FLAGS_minloglevel = backendConf_->get(kGlogSeverityLevel, kGlogSeverityLevelDefault); } else { - if (backendConf_->isValueExists(kGlogVerboseLevel)) { + if (backendConf_->valueExists(kGlogVerboseLevel)) { FLAGS_v = backendConf_->get(kGlogVerboseLevel, kGlogVerboseLevelDefault); } else { FLAGS_v = kGlogVerboseLevelMaximum; @@ -86,6 +88,9 @@ void VeloxBackend::init(const std::unordered_map& conf FLAGS_logtostderr = true; google::InitGoogleLogging("gluten"); + // Allow growing buffer in another task through its memory pool. + FLAGS_velox_memory_pool_capacity_transfer_across_tasks = true; + // Avoid creating too many shared leaf pools. 
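The BenchmarkUtils change above no longer uses the configured local dirs directly: it creates a uuid-suffixed temp_shuffle_ subdirectory under each of them and hashes the thread id to pick the directory for the shuffle data file. A rough stand-alone sketch of that selection logic using only the standard library; pseudoUuid() and the /tmp paths are stand-ins for gluten::generateUuid(), createTempShuffleFile() and the real local dirs.

#include <filesystem>
#include <functional>
#include <iostream>
#include <random>
#include <string>
#include <thread>
#include <vector>

// Stand-in for gluten::generateUuid().
std::string pseudoUuid() {
  std::mt19937_64 rng{std::random_device{}()};
  return std::to_string(rng());
}

int main() {
  std::vector<std::string> localDirs;
  for (const std::string base : {"/tmp/dir1", "/tmp/dir2"}) {
    auto dir = std::filesystem::path(base) / ("temp_shuffle_" + pseudoUuid());
    std::filesystem::create_directories(dir);
    localDirs.push_back(dir.string());
  }
  // Hash the current thread id to pick one directory for this thread's data file.
  size_t id = std::hash<std::thread::id>{}(std::this_thread::get_id()) % localDirs.size();
  std::cout << "Shuffle data goes under: " << localDirs[id] << std::endl;
  return 0;
}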
FLAGS_velox_memory_num_shared_leaf_pools = 0; @@ -187,15 +192,15 @@ void VeloxBackend::initCache() { void VeloxBackend::initConnector() { // The configs below are used at process level. - std::unordered_map connectorConfMap = backendConf_->values(); + std::unordered_map connectorConfMap = backendConf_->rawConfigs(); auto hiveConf = getHiveConfig(backendConf_); - for (auto& [k, v] : hiveConf->valuesCopy()) { + for (auto& [k, v] : hiveConf->rawConfigsCopy()) { connectorConfMap[k] = v; } #ifdef ENABLE_ABFS - const auto& confValue = backendConf_->values(); + const auto& confValue = backendConf_->rawConfigs(); for (auto& [k, v] : confValue) { if (k.find("fs.azure.account.key") == 0) { connectorConfMap[k] = v; @@ -205,6 +210,7 @@ void VeloxBackend::initConnector() { } } #endif + connectorConfMap[velox::connector::hive::HiveConfig::kEnableFileHandleCache] = backendConf_->get(kVeloxFileHandleCacheEnabled, kVeloxFileHandleCacheEnabledDefault) ? "true" : "false"; @@ -233,7 +239,7 @@ void VeloxBackend::initConnector() { } velox::connector::registerConnector(std::make_shared( kHiveConnectorId, - std::make_shared(std::move(connectorConfMap)), + std::make_shared(std::move(connectorConfMap)), ioExecutor_.get())); } diff --git a/cpp/velox/compute/VeloxBackend.h b/cpp/velox/compute/VeloxBackend.h index e8298eeed1921..df04428559787 100644 --- a/cpp/velox/compute/VeloxBackend.h +++ b/cpp/velox/compute/VeloxBackend.h @@ -25,9 +25,9 @@ #include #include "velox/common/caching/AsyncDataCache.h" +#include "velox/common/config/Config.h" #include "velox/common/memory/MemoryPool.h" #include "velox/common/memory/MmapAllocator.h" -#include "velox/core/Config.h" namespace gluten { /// As a static instance in per executor, initialized at executor startup. @@ -53,7 +53,7 @@ class VeloxBackend { facebook::velox::cache::AsyncDataCache* getAsyncDataCache() const; - std::shared_ptr getBackendConf() const { + std::shared_ptr getBackendConf() const { return backendConf_; } @@ -92,7 +92,7 @@ class VeloxBackend { std::string cachePathPrefix_; std::string cacheFilePrefix_; - std::shared_ptr backendConf_; + std::shared_ptr backendConf_; }; } // namespace gluten diff --git a/cpp/velox/compute/VeloxRuntime.cc b/cpp/velox/compute/VeloxRuntime.cc index cdce781bd528a..996fcb850727a 100644 --- a/cpp/velox/compute/VeloxRuntime.cc +++ b/cpp/velox/compute/VeloxRuntime.cc @@ -62,7 +62,8 @@ VeloxRuntime::VeloxRuntime( : Runtime(std::make_shared(std::move(listener)), confMap) { // Refresh session config. 
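initConnector() above assembles the Hive connector configuration by copying the backend's raw configs, overlaying the Hive config derived from them, and finally forcing the file handle cache flag from Gluten's kVeloxFileHandleCacheEnabled option. A simplified sketch of that merge using plain maps; the cache key string is a stand-in for velox::connector::hive::HiveConfig::kEnableFileHandleCache.

#include <iostream>
#include <string>
#include <unordered_map>

using ConfMap = std::unordered_map<std::string, std::string>;

ConfMap buildConnectorConf(const ConfMap& backendConf, const ConfMap& hiveConf, bool fileHandleCacheEnabled) {
  ConfMap connectorConf = backendConf;  // start from the backend's raw configs
  for (const auto& [k, v] : hiveConf) {
    connectorConf[k] = v;               // hive-derived values overlay the raw ones
  }
  // Stand-in key; the real code uses HiveConfig::kEnableFileHandleCache.
  connectorConf["hive.file-handle-cache-enabled"] = fileHandleCacheEnabled ? "true" : "false";
  return connectorConf;
}

int main() {
  auto conf = buildConnectorConf({{"spark.some.conf", "1"}, {"shared.key", "raw"}}, {{"shared.key", "hive"}}, false);
  std::cout << conf["shared.key"] << " " << conf["hive.file-handle-cache-enabled"] << std::endl; // hive false
  return 0;
}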
vmm_ = dynamic_cast(memoryManager_.get()); - veloxCfg_ = std::make_shared(confMap_); + veloxCfg_ = + std::make_shared(std::unordered_map(confMap_)); debugModeEnabled_ = veloxCfg_->get(kDebugModeEnabled, false); FLAGS_minloglevel = veloxCfg_->get(kGlogSeverityLevel, FLAGS_minloglevel); FLAGS_v = veloxCfg_->get(kGlogVerboseLevel, FLAGS_v); @@ -270,7 +271,7 @@ std::unique_ptr VeloxRuntime::createColumnarBatchSerial } void VeloxRuntime::dumpConf(const std::string& path) { - const auto& backendConfMap = VeloxBackend::get()->getBackendConf()->values(); + const auto& backendConfMap = VeloxBackend::get()->getBackendConf()->rawConfigs(); auto allConfMap = backendConfMap; for (const auto& pair : confMap_) { diff --git a/cpp/velox/compute/VeloxRuntime.h b/cpp/velox/compute/VeloxRuntime.h index 952a103ed8ad2..3460677d9113a 100644 --- a/cpp/velox/compute/VeloxRuntime.h +++ b/cpp/velox/compute/VeloxRuntime.h @@ -101,7 +101,7 @@ class VeloxRuntime final : public Runtime { private: VeloxMemoryManager* vmm_; std::shared_ptr veloxPlan_; - std::shared_ptr veloxCfg_; + std::shared_ptr veloxCfg_; bool debugModeEnabled_{false}; std::unordered_map> emptySchemaBatchLoopUp_; diff --git a/cpp/velox/compute/WholeStageResultIterator.cc b/cpp/velox/compute/WholeStageResultIterator.cc index 4544c01653d97..2edf9a5731216 100644 --- a/cpp/velox/compute/WholeStageResultIterator.cc +++ b/cpp/velox/compute/WholeStageResultIterator.cc @@ -62,7 +62,8 @@ WholeStageResultIterator::WholeStageResultIterator( const std::unordered_map& confMap, const SparkTaskInfo& taskInfo) : memoryManager_(memoryManager), - veloxCfg_(std::make_shared(confMap)), + veloxCfg_( + std::make_shared(std::unordered_map(confMap))), taskInfo_(taskInfo), veloxPlan_(planNode), scanNodeIds_(scanNodeIds), @@ -175,7 +176,7 @@ WholeStageResultIterator::WholeStageResultIterator( } std::shared_ptr WholeStageResultIterator::createNewVeloxQueryCtx() { - std::unordered_map> connectorConfigs; + std::unordered_map> connectorConfigs; connectorConfigs[kHiveConnectorId] = createConnectorConfig(); std::shared_ptr ctx = velox::core::QueryCtx::create( @@ -194,7 +195,22 @@ std::shared_ptr WholeStageResultIterator::next() { if (task_->isFinished()) { return nullptr; } - velox::RowVectorPtr vector = task_->next(); + velox::RowVectorPtr vector; + while (true) { + auto future = velox::ContinueFuture::makeEmpty(); + auto out = task_->next(&future); + if (!future.valid()) { + // Not need to wait. Break. + vector = std::move(out); + break; + } + // Velox suggested to wait. This might be because another thread (e.g., background io thread) is spilling the task. + GLUTEN_CHECK(out == nullptr, "Expected to wait but still got non-null output from Velox task"); + VLOG(2) << "Velox task " << task_->taskId() + << " is busy when ::next() is called. Will wait and try again. Task state: " + << taskStateString(task_->state()); + future.wait(); + } if (vector == nullptr) { return nullptr; } @@ -209,29 +225,6 @@ std::shared_ptr WholeStageResultIterator::next() { return std::make_shared(vector); } -namespace { -class SuspendedSection { - public: - SuspendedSection() { - reclaimer_->enterArbitration(); - } - - virtual ~SuspendedSection() { - reclaimer_->leaveArbitration(); - } - - // singleton - SuspendedSection(const SuspendedSection&) = delete; - SuspendedSection(SuspendedSection&&) = delete; - SuspendedSection& operator=(const SuspendedSection&) = delete; - SuspendedSection& operator=(SuspendedSection&&) = delete; - - private: - // We only use suspension APIs in exec::MemoryReclaimer. 
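The rewritten next() above follows Velox's cooperative contract for Task::next(&future): when the task cannot produce output (for example while another thread is spilling it), it returns null together with a valid future that the caller must wait on before retrying. A simplified stand-alone model of that contract built on std::future; Producer is a stand-in type, not Velox's Task.

#include <future>
#include <optional>

struct Producer {
  int remaining = 3;
  bool busyOnce = true;
  // Returns a value, or std::nullopt; when blocked it also hands back a future to wait on.
  std::optional<int> next(std::future<void>* wait) {
    if (busyOnce) {
      busyOnce = false;
      std::promise<void> p;
      p.set_value();  // in real code another thread fulfills this when it is safe to resume
      *wait = p.get_future();
      return std::nullopt;
    }
    if (remaining == 0) return std::nullopt;  // finished: no future handed back
    return remaining--;
  }
};

int consumeAll(Producer& producer) {
  int sum = 0;
  while (true) {
    std::future<void> wait;
    auto out = producer.next(&wait);
    if (!wait.valid()) {
      if (!out.has_value()) break;  // true end of stream
      sum += *out;
      continue;
    }
    // Producer asked us to wait (e.g. it is being spilled by another thread); block and retry.
    wait.wait();
  }
  return sum;
}

int main() {
  Producer p;
  return consumeAll(p) == 6 ? 0 : 1;
}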
- std::unique_ptr reclaimer_{velox::exec::MemoryReclaimer::create()}; -}; -} // namespace - int64_t WholeStageResultIterator::spillFixedSize(int64_t size) { auto pool = memoryManager_->getAggregateMemoryPool(); std::string poolName{pool->root()->name() + "/" + pool->name()}; @@ -241,11 +234,8 @@ int64_t WholeStageResultIterator::spillFixedSize(int64_t size) { if (spillStrategy_ == "auto") { int64_t remaining = size - shrunken; LOG(INFO) << logPrefix << "Trying to request spilling for " << remaining << " bytes..."; - // suspend the driver when we are on it - SuspendedSection suspender; - velox::exec::MemoryReclaimer::Stats status; auto* mm = memoryManager_->getMemoryManager(); - uint64_t spilledOut = mm->arbitrator()->shrinkCapacity({pool}, remaining); // this conducts spilling + uint64_t spilledOut = mm->arbitrator()->shrinkCapacity(remaining); // this conducts spilling LOG(INFO) << logPrefix << "Successfully spilled out " << spilledOut << " bytes."; uint64_t total = shrunken + spilledOut; VLOG(2) << logPrefix << "Successfully reclaimed total " << total << " bytes."; @@ -437,10 +427,10 @@ std::unordered_map WholeStageResultIterator::getQueryC // Find offheap size from Spark confs. If found, set the max memory usage of partial aggregation. // FIXME this uses process-wise off-heap memory which is not for task try { - if (veloxCfg_->isValueExists(kDefaultSessionTimezone)) { + if (veloxCfg_->valueExists(kDefaultSessionTimezone)) { configs[velox::core::QueryConfig::kSessionTimezone] = veloxCfg_->get(kDefaultSessionTimezone, ""); } - if (veloxCfg_->isValueExists(kSessionTimezone)) { + if (veloxCfg_->valueExists(kSessionTimezone)) { configs[velox::core::QueryConfig::kSessionTimezone] = veloxCfg_->get(kSessionTimezone, ""); } // Adjust timestamp according to the above configured session timezone. @@ -519,7 +509,7 @@ std::unordered_map WholeStageResultIterator::getQueryC return configs; } -std::shared_ptr WholeStageResultIterator::createConnectorConfig() { +std::shared_ptr WholeStageResultIterator::createConnectorConfig() { // The configs below are used at session level. std::unordered_map configs = {}; // The semantics of reading as lower case is opposite with case-sensitive. @@ -532,7 +522,7 @@ std::shared_ptr WholeStageResultIterator::createConnectorConfig() std::to_string(veloxCfg_->get(kMaxPartitions, 10000)); configs[velox::connector::hive::HiveConfig::kIgnoreMissingFilesSession] = std::to_string(veloxCfg_->get(kIgnoreMissingFiles, false)); - return std::make_shared(configs); + return std::make_shared(std::move(configs)); } } // namespace gluten diff --git a/cpp/velox/compute/WholeStageResultIterator.h b/cpp/velox/compute/WholeStageResultIterator.h index 5e661f40485a1..371ec0c14c531 100644 --- a/cpp/velox/compute/WholeStageResultIterator.h +++ b/cpp/velox/compute/WholeStageResultIterator.h @@ -23,8 +23,8 @@ #include "substrait/SubstraitToVeloxPlan.h" #include "substrait/plan.pb.h" #include "utils/metrics.h" +#include "velox/common/config/Config.h" #include "velox/connectors/hive/iceberg/IcebergSplit.h" -#include "velox/core/Config.h" #include "velox/core/PlanNode.h" #include "velox/exec/Task.h" @@ -80,7 +80,7 @@ class WholeStageResultIterator : public ColumnarBatchIterator { std::vector& nodeIds); /// Create connector config. - std::shared_ptr createConnectorConfig(); + std::shared_ptr createConnectorConfig(); /// Construct partition columns. 
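spillFixedSize() above now reclaims memory in two steps: shrink free capacity from the local pools first, then ask the arbitrator's shrinkCapacity(remaining) to spill whatever is still missing, with the SuspendedSection guard removed. A toy stand-alone model of that ordering; FakePool and FakeArbitrator are simplified stand-ins, not Velox classes.

#include <algorithm>
#include <cstdint>
#include <iostream>

struct FakePool {
  int64_t freeBytes = 64;
  int64_t shrink() { auto f = freeBytes; freeBytes = 0; return f; }
};

struct FakeArbitrator {
  int64_t spillable = 200;
  int64_t shrinkCapacity(int64_t target) { auto s = std::min(target, spillable); spillable -= s; return s; }
};

int64_t spillFixedSize(FakePool& pool, FakeArbitrator& arbitrator, int64_t size) {
  int64_t shrunken = pool.shrink();  // cheap: release unused capacity first
  int64_t remaining = size - shrunken;
  int64_t spilledOut = 0;
  if (remaining > 0) {
    spilledOut = arbitrator.shrinkCapacity(remaining);  // expensive: conducts spilling
  }
  return shrunken + spilledOut;
}

int main() {
  FakePool pool;
  FakeArbitrator arb;
  std::cout << spillFixedSize(pool, arb, 128) << " bytes reclaimed" << std::endl;  // 64 shrunk + 64 spilled
  return 0;
}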
void constructPartitionColumns( @@ -103,7 +103,7 @@ class WholeStageResultIterator : public ColumnarBatchIterator { VeloxMemoryManager* memoryManager_; /// Config, task and plan. - std::shared_ptr veloxCfg_; + std::shared_ptr veloxCfg_; const SparkTaskInfo taskInfo_; std::shared_ptr task_; std::shared_ptr veloxPlan_; diff --git a/cpp/velox/jni/JniFileSystem.cc b/cpp/velox/jni/JniFileSystem.cc index 8bf791c234e80..55c1dc734969a 100644 --- a/cpp/velox/jni/JniFileSystem.cc +++ b/cpp/velox/jni/JniFileSystem.cc @@ -261,7 +261,8 @@ class FileSystemWrapper : public facebook::velox::filesystems::FileSystem { class JniFileSystem : public facebook::velox::filesystems::FileSystem { public: - explicit JniFileSystem(jobject obj, std::shared_ptr config) : FileSystem(config) { + explicit JniFileSystem(jobject obj, std::shared_ptr config) + : FileSystem(config) { JNIEnv* env; attachCurrentThreadAsDaemonOrThrow(vm, &env); obj_ = env->NewGlobalRef(obj); @@ -374,9 +375,10 @@ class JniFileSystem : public facebook::velox::filesystems::FileSystem { return [](std::string_view filePath) { return filePath.find(kJniFsScheme) == 0; }; } - static std::function(std::shared_ptr, std::string_view)> + static std::function< + std::shared_ptr(std::shared_ptr, std::string_view)> fileSystemGenerator() { - return [](std::shared_ptr properties, std::string_view filePath) { + return [](std::shared_ptr properties, std::string_view filePath) { JNIEnv* env; attachCurrentThreadAsDaemonOrThrow(vm, &env); jobject obj = env->CallStaticObjectMethod(jniFileSystemClass, jniGetFileSystem); @@ -455,7 +457,7 @@ void gluten::registerJolFileSystem(uint64_t maxFileSize) { auto fileSystemGenerator = [maxFileSize]( - std::shared_ptr properties, + std::shared_ptr properties, std::string_view filePath) -> std::shared_ptr { // select JNI file if there is enough space if (JniFileSystem::isCapableForNewFile(maxFileSize)) { diff --git a/cpp/velox/jni/JniUdf.cc b/cpp/velox/jni/JniUdf.cc index cab90b325fe50..8230724f12602 100644 --- a/cpp/velox/jni/JniUdf.cc +++ b/cpp/velox/jni/JniUdf.cc @@ -41,8 +41,8 @@ void gluten::initVeloxJniUDF(JNIEnv* env) { udfResolverClass = createGlobalClassReferenceOrError(env, kUdfResolverClassPath.c_str()); // methods - registerUDFMethod = getMethodIdOrError(env, udfResolverClass, "registerUDF", "(Ljava/lang/String;[B[BZ)V"); - registerUDAFMethod = getMethodIdOrError(env, udfResolverClass, "registerUDAF", "(Ljava/lang/String;[B[B[BZ)V"); + registerUDFMethod = getMethodIdOrError(env, udfResolverClass, "registerUDF", "(Ljava/lang/String;[B[BZZ)V"); + registerUDAFMethod = getMethodIdOrError(env, udfResolverClass, "registerUDAF", "(Ljava/lang/String;[B[B[BZZ)V"); } void gluten::finalizeVeloxJniUDF(JNIEnv* env) { @@ -71,9 +71,23 @@ void gluten::jniGetFunctionSignatures(JNIEnv* env) { signature->intermediateType.length(), reinterpret_cast(signature->intermediateType.c_str())); env->CallVoidMethod( - instance, registerUDAFMethod, name, returnType, argTypes, intermediateType, signature->variableArity); + instance, + registerUDAFMethod, + name, + returnType, + argTypes, + intermediateType, + signature->variableArity, + signature->allowTypeConversion); } else { - env->CallVoidMethod(instance, registerUDFMethod, name, returnType, argTypes, signature->variableArity); + env->CallVoidMethod( + instance, + registerUDFMethod, + name, + returnType, + argTypes, + signature->variableArity, + signature->allowTypeConversion); } checkException(env); } diff --git a/cpp/velox/jni/VeloxJniWrapper.cc b/cpp/velox/jni/VeloxJniWrapper.cc index 
cb49abd7d4668..5df3a478e6674 100644 --- a/cpp/velox/jni/VeloxJniWrapper.cc +++ b/cpp/velox/jni/VeloxJniWrapper.cc @@ -33,6 +33,7 @@ #include "utils/ObjectStore.h" #include "utils/VeloxBatchResizer.h" #include "velox/common/base/BloomFilter.h" +#include "velox/common/file/FileSystems.h" #include @@ -260,6 +261,24 @@ JNIEXPORT jlong JNICALL Java_org_apache_gluten_utils_VeloxBatchResizerJniWrapper JNI_METHOD_END(gluten::kInvalidObjectHandle) } +JNIEXPORT jboolean JNICALL +Java_org_apache_gluten_utils_VeloxFileSystemValidationJniWrapper_allSupportedByRegisteredFileSystems( // NOLINT + JNIEnv* env, + jclass, + jobjectArray stringArray) { + JNI_METHOD_START + int size = env->GetArrayLength(stringArray); + for (int i = 0; i < size; i++) { + jstring string = (jstring)(env->GetObjectArrayElement(stringArray, i)); + std::string path = jStringToCString(env, string); + if (!velox::filesystems::isPathSupportedByRegisteredFileSystems(path)) { + return false; + } + } + return true; + JNI_METHOD_END(false) +} + #ifdef __cplusplus } #endif diff --git a/cpp/velox/memory/VeloxMemoryManager.cc b/cpp/velox/memory/VeloxMemoryManager.cc index 442090004a417..6b5606dd228e1 100644 --- a/cpp/velox/memory/VeloxMemoryManager.cc +++ b/cpp/velox/memory/VeloxMemoryManager.cc @@ -35,61 +35,96 @@ namespace gluten { using namespace facebook; +namespace { + +static constexpr std::string_view kMemoryPoolInitialCapacity{"memory-pool-initial-capacity"}; +static constexpr uint64_t kDefaultMemoryPoolInitialCapacity{256 << 20}; +static constexpr std::string_view kMemoryPoolTransferCapacity{"memory-pool-transfer-capacity"}; +static constexpr uint64_t kDefaultMemoryPoolTransferCapacity{128 << 20}; + +template +T getConfig( + const std::unordered_map& configs, + const std::string_view& key, + const T& defaultValue) { + if (configs.count(std::string(key)) > 0) { + try { + return folly::to(configs.at(std::string(key))); + } catch (const std::exception& e) { + VELOX_USER_FAIL("Failed while parsing SharedArbitrator configs: {}", e.what()); + } + } + return defaultValue; +} +} // namespace /// We assume in a single Spark task. No thread-safety should be guaranteed. 
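The getConfig() helper above pulls typed values such as memory-pool-initial-capacity out of the arbitrator's extra configs, falling back to a default when the key is absent. A stand-alone sketch of the same lookup that parses with the standard library instead of folly::to so it compiles on its own; error handling is reduced to the bare minimum.

#include <cstdint>
#include <string>
#include <unordered_map>

template <typename T>
T getConfig(const std::unordered_map<std::string, std::string>& configs,
            const std::string& key, const T& defaultValue) {
  auto it = configs.find(key);
  if (it == configs.end()) {
    return defaultValue;
  }
  // The real helper uses folly::to<T> and rethrows parse failures as a user error.
  return static_cast<T>(std::stoull(it->second));
}

int main() {
  std::unordered_map<std::string, std::string> extra{{"memory-pool-initial-capacity", "268435456"}};
  constexpr uint64_t kDefaultInit = 256 << 20;
  constexpr uint64_t kDefaultTransfer = 128 << 20;
  auto init = getConfig<uint64_t>(extra, "memory-pool-initial-capacity", kDefaultInit);
  auto transfer = getConfig<uint64_t>(extra, "memory-pool-transfer-capacity", kDefaultTransfer);
  return (init == kDefaultInit && transfer == kDefaultTransfer) ? 0 : 1;
}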
class ListenableArbitrator : public velox::memory::MemoryArbitrator { public: ListenableArbitrator(const Config& config, AllocationListener* listener) - : MemoryArbitrator(config), listener_(listener) {} - + : MemoryArbitrator(config), + listener_(listener), + memoryPoolInitialCapacity_(velox::config::toCapacity( + getConfig( + config.extraConfigs, + kMemoryPoolInitialCapacity, + std::to_string(kDefaultMemoryPoolInitialCapacity)), + velox::config::CapacityUnit::BYTE)), + memoryPoolTransferCapacity_(velox::config::toCapacity( + getConfig( + config.extraConfigs, + kMemoryPoolTransferCapacity, + std::to_string(kDefaultMemoryPoolTransferCapacity)), + velox::config::CapacityUnit::BYTE)) {} std::string kind() const override { return kind_; } - uint64_t growCapacity(velox::memory::MemoryPool* pool, uint64_t targetBytes) override { - std::lock_guard l(mutex_); - listener_->allocationChanged(targetBytes); - if (!growPool(pool, targetBytes, 0)) { - VELOX_FAIL("Failed to grow root pool's capacity for {}", velox::succinctBytes(targetBytes)); - } - return targetBytes; + void addPool(const std::shared_ptr& pool) override { + VELOX_CHECK_EQ(pool->capacity(), 0); + + std::unique_lock guard{mutex_}; + VELOX_CHECK_EQ(candidates_.count(pool.get()), 0); + candidates_.emplace(pool.get(), pool->weak_from_this()); } - uint64_t shrinkCapacity(velox::memory::MemoryPool* pool, uint64_t targetBytes) override { - std::lock_guard l(mutex_); - return shrinkCapacityLocked(pool, targetBytes); + void removePool(velox::memory::MemoryPool* pool) override { + VELOX_CHECK_EQ(pool->reservedBytes(), 0); + shrinkCapacity(pool, pool->capacity()); + + std::unique_lock guard{mutex_}; + const auto ret = candidates_.erase(pool); + VELOX_CHECK_EQ(ret, 1); } - bool growCapacity( - velox::memory::MemoryPool* pool, - const std::vector>& candidatePools, - uint64_t targetBytes) override { + bool growCapacity(velox::memory::MemoryPool* pool, uint64_t targetBytes) override { velox::memory::ScopedMemoryArbitrationContext ctx(pool); - VELOX_CHECK_EQ(candidatePools.size(), 1, "ListenableArbitrator should only be used within a single root pool") - auto candidate = candidatePools.back(); - VELOX_CHECK(pool->root() == candidate.get(), "Illegal state in ListenableArbitrator"); + velox::memory::MemoryPool* candidate; + { + std::unique_lock guard{mutex_}; + VELOX_CHECK_EQ(candidates_.size(), 1, "ListenableArbitrator should only be used within a single root pool") + candidate = candidates_.begin()->first; + } + VELOX_CHECK(pool->root() == candidate, "Illegal state in ListenableArbitrator"); - std::lock_guard l(mutex_); - growCapacityLocked(pool->root(), targetBytes); + growCapacity0(pool->root(), targetBytes); return true; } - uint64_t shrinkCapacity( - const std::vector>& pools, - uint64_t targetBytes, - bool allowSpill, - bool allowAbort) override { + uint64_t shrinkCapacity(uint64_t targetBytes, bool allowSpill, bool allowAbort) override { velox::memory::ScopedMemoryArbitrationContext ctx((const velox::memory::MemoryPool*)nullptr); facebook::velox::exec::MemoryReclaimer::Stats status; - VELOX_CHECK_EQ(pools.size(), 1, "Gluten only has one root pool"); - std::lock_guard l(mutex_); // FIXME: Do we have recursive locking for this mutex? 
- auto pool = pools.at(0); - const uint64_t oldCapacity = pool->capacity(); + velox::memory::MemoryPool* pool; + { + std::unique_lock guard{mutex_}; + VELOX_CHECK_EQ(candidates_.size(), 1, "ListenableArbitrator should only be used within a single root pool") + pool = candidates_.begin()->first; + } pool->reclaim(targetBytes, 0, status); // ignore the output - shrinkPool(pool.get(), 0); - const uint64_t newCapacity = pool->capacity(); - uint64_t total = oldCapacity - newCapacity; - listener_->allocationChanged(-total); - return total; + return shrinkCapacity0(pool, 0); + } + + uint64_t shrinkCapacity(velox::memory::MemoryPool* pool, uint64_t targetBytes) override { + return shrinkCapacity0(pool, targetBytes); } Stats stats() const override { @@ -102,7 +137,7 @@ class ListenableArbitrator : public velox::memory::MemoryArbitrator { } private: - void growCapacityLocked(velox::memory::MemoryPool* pool, uint64_t bytes) { + void growCapacity0(velox::memory::MemoryPool* pool, uint64_t bytes) { // Since // https://github.com/facebookincubator/velox/pull/9557/files#diff-436e44b7374032f8f5d7eb45869602add6f955162daa2798d01cc82f8725724dL812-L820, // We should pass bytes as parameter "reservationBytes" when calling ::grow. @@ -124,15 +159,19 @@ class ListenableArbitrator : public velox::memory::MemoryArbitrator { pool->toString()) } - uint64_t shrinkCapacityLocked(velox::memory::MemoryPool* pool, uint64_t bytes) { + uint64_t shrinkCapacity0(velox::memory::MemoryPool* pool, uint64_t bytes) { uint64_t freeBytes = shrinkPool(pool, bytes); listener_->allocationChanged(-freeBytes); return freeBytes; } gluten::AllocationListener* listener_; - std::recursive_mutex mutex_; + const uint64_t memoryPoolInitialCapacity_; // FIXME: Unused. + const uint64_t memoryPoolTransferCapacity_; + + mutable std::mutex mutex_; inline static std::string kind_ = "GLUTEN"; + std::unordered_map> candidates_; }; class ArbitratorFactoryRegister { @@ -220,10 +259,7 @@ int64_t shrinkVeloxMemoryPool(velox::memory::MemoryManager* mm, velox::memory::M VLOG(2) << logPrefix << "Pool has reserved " << pool->usedBytes() << "/" << pool->root()->reservedBytes() << "/" << pool->root()->capacity() << "/" << pool->root()->maxCapacity() << " bytes."; VLOG(2) << logPrefix << "Shrinking..."; - const uint64_t oldCapacity = pool->capacity(); - mm->arbitrator()->shrinkCapacity(pool, 0); - const uint64_t newCapacity = pool->capacity(); - int64_t shrunken = oldCapacity - newCapacity; + auto shrunken = mm->arbitrator()->shrinkCapacity(pool, 0); VLOG(2) << logPrefix << shrunken << " bytes released from shrinking."; return shrunken; } diff --git a/cpp/velox/operators/functions/Arithmetic.h b/cpp/velox/operators/functions/Arithmetic.h index 0474e1554981f..7b4c9ae9db7c6 100644 --- a/cpp/velox/operators/functions/Arithmetic.h +++ b/cpp/velox/operators/functions/Arithmetic.h @@ -17,6 +17,7 @@ #include #include #include +#include #include namespace gluten { @@ -38,14 +39,16 @@ struct RoundFunction { return number; } - double factor = std::pow(10, decimals); + // Using long double for high precision during intermediate calculations. + // TODO: Make this more efficient with Boost to support high arbitrary precision at runtime. 
+ long double factor = std::pow(10.0L, static_cast(decimals)); static const TNum kInf = std::numeric_limits::infinity(); + if (number < 0) { - return (std::round(std::nextafter(number, -kInf) * factor * -1) / factor) * -1; + return static_cast((std::round(std::nextafter(number, -kInf) * factor * -1) / factor) * -1); } - return std::round(std::nextafter(number, kInf) * factor) / factor; + return static_cast(std::round(std::nextafter(number, kInf) * factor) / factor); } - template FOLLY_ALWAYS_INLINE void call(TInput& result, const TInput& a, const int32_t b = 0) { result = round(a, b); diff --git a/cpp/velox/operators/serializer/VeloxColumnarBatchSerializer.cc b/cpp/velox/operators/serializer/VeloxColumnarBatchSerializer.cc index 8b21e7bdbbeb8..acb14cf4de391 100644 --- a/cpp/velox/operators/serializer/VeloxColumnarBatchSerializer.cc +++ b/cpp/velox/operators/serializer/VeloxColumnarBatchSerializer.cc @@ -34,7 +34,7 @@ namespace { std::unique_ptr toByteStream(uint8_t* data, int32_t size) { std::vector byteRanges; byteRanges.push_back(ByteRange{data, size, 0}); - auto byteStream = std::make_unique(byteRanges); + auto byteStream = std::make_unique(byteRanges); return byteStream; } } // namespace diff --git a/cpp/velox/operators/writer/VeloxParquetDatasourceABFS.h b/cpp/velox/operators/writer/VeloxParquetDatasourceABFS.h index 82e8f794cbce5..b542e42babd47 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasourceABFS.h +++ b/cpp/velox/operators/writer/VeloxParquetDatasourceABFS.h @@ -43,7 +43,8 @@ class VeloxParquetDatasourceABFS final : public VeloxParquetDatasource { : VeloxParquetDatasource(filePath, veloxPool, sinkPool, schema) {} void initSink(const std::unordered_map& sparkConfs) override { - auto hiveConf = getHiveConfig(std::make_shared(sparkConfs)); + auto hiveConf = getHiveConfig(std::make_shared( + std::unordered_map(sparkConfs))); auto fileSystem = filesystems::getFileSystem(filePath_, hiveConf); auto* abfsFileSystem = dynamic_cast(fileSystem.get()); sink_ = std::make_unique( diff --git a/cpp/velox/operators/writer/VeloxParquetDatasourceHDFS.h b/cpp/velox/operators/writer/VeloxParquetDatasourceHDFS.h index 7722c8e51993f..19e9e35606489 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasourceHDFS.h +++ b/cpp/velox/operators/writer/VeloxParquetDatasourceHDFS.h @@ -43,7 +43,8 @@ class VeloxParquetDatasourceHDFS final : public VeloxParquetDatasource { : VeloxParquetDatasource(filePath, veloxPool, sinkPool, schema) {} void initSink(const std::unordered_map& sparkConfs) override { - auto hiveConf = getHiveConfig(std::make_shared(sparkConfs)); + auto hiveConf = getHiveConfig(std::make_shared( + std::unordered_map(sparkConfs))); sink_ = dwio::common::FileSink::create(filePath_, {.connectorProperties = hiveConf, .pool = sinkPool_.get()}); } }; diff --git a/cpp/velox/operators/writer/VeloxParquetDatasourceS3.h b/cpp/velox/operators/writer/VeloxParquetDatasourceS3.h index 3231a8a1ee5cd..8219fe42a3c4c 100644 --- a/cpp/velox/operators/writer/VeloxParquetDatasourceS3.h +++ b/cpp/velox/operators/writer/VeloxParquetDatasourceS3.h @@ -43,7 +43,8 @@ class VeloxParquetDatasourceS3 final : public VeloxParquetDatasource { : VeloxParquetDatasource(filePath, veloxPool, sinkPool, schema) {} void initSink(const std::unordered_map& sparkConfs) override { - auto hiveConf = getHiveConfig(std::make_shared(sparkConfs)); + auto hiveConf = getHiveConfig(std::make_shared( + std::unordered_map(sparkConfs))); sink_ = dwio::common::FileSink::create(filePath_, {.connectorProperties = hiveConf, .pool = 
sinkPool_.get()}); } }; diff --git a/cpp/velox/shuffle/GlutenByteStream.h b/cpp/velox/shuffle/GlutenByteStream.h new file mode 100644 index 0000000000000..78ea7b905adc3 --- /dev/null +++ b/cpp/velox/shuffle/GlutenByteStream.h @@ -0,0 +1,267 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// TODO: wait to delete after rss sort reader refactored. +#include "velox/common/memory/ByteStream.h" + +namespace facebook::velox { +class GlutenByteInputStream : public ByteInputStream { + protected: + /// TODO Remove after refactoring SpillInput. + GlutenByteInputStream() {} + + public: + explicit GlutenByteInputStream(std::vector ranges) { + ranges_ = std::move(ranges); + VELOX_CHECK(!ranges_.empty()); + current_ = &ranges_[0]; + } + + /// Disable copy constructor. + GlutenByteInputStream(const GlutenByteInputStream&) = delete; + + /// Disable copy assignment operator. + GlutenByteInputStream& operator=(const GlutenByteInputStream& other) = delete; + + /// Enable move constructor. + GlutenByteInputStream(GlutenByteInputStream&& other) noexcept = delete; + + /// Enable move assignment operator. + GlutenByteInputStream& operator=(GlutenByteInputStream&& other) noexcept { + if (this != &other) { + ranges_ = std::move(other.ranges_); + current_ = other.current_; + other.current_ = nullptr; + } + return *this; + } + + /// TODO Remove after refactoring SpillInput. + virtual ~GlutenByteInputStream() = default; + + /// Returns total number of bytes available in the stream. + size_t size() const { + size_t total = 0; + for (const auto& range : ranges_) { + total += range.size; + } + return total; + } + + /// Returns true if all input has been read. + /// + /// TODO: Remove 'virtual' after refactoring SpillInput. + virtual bool atEnd() const { + if (!current_) { + return false; + } + if (current_->position < current_->size) { + return false; + } + + VELOX_CHECK(current_ >= ranges_.data() && current_ <= &ranges_.back()); + return current_ == &ranges_.back(); + } + + /// Returns current position (number of bytes from the start) in the stream. + std::streampos tellp() const { + if (ranges_.empty()) { + return 0; + } + VELOX_DCHECK_NOT_NULL(current_); + int64_t size = 0; + for (auto& range : ranges_) { + if (&range == current_) { + return current_->position + size; + } + size += range.size; + } + VELOX_FAIL("GlutenByteInputStream 'current_' is not in 'ranges_'."); + } + + /// Moves current position to specified one. 
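GlutenByteInputStream above keeps a vector of byte ranges plus a pointer to the current one; size() is the sum of all range sizes and tellp() is the bytes consumed in earlier ranges plus the position inside the current range. A tiny stand-alone illustration of that bookkeeping; Range here is a cut-down stand-in for Velox's ByteRange, which also carries the buffer pointer.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

struct Range {
  int32_t size;
  int32_t position;
};

int64_t streamSize(const std::vector<Range>& ranges) {
  int64_t total = 0;
  for (const auto& r : ranges) total += r.size;
  return total;
}

int64_t tellp(const std::vector<Range>& ranges, size_t currentIndex) {
  int64_t before = 0;
  for (size_t i = 0; i < currentIndex; ++i) before += ranges[i].size;
  return before + ranges[currentIndex].position;
}

int main() {
  std::vector<Range> ranges{{100, 100}, {50, 20}, {80, 0}};
  std::cout << streamSize(ranges) << " " << tellp(ranges, 1) << std::endl;  // 230 120
  return 0;
}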
+ void seekp(std::streampos position) { + if (ranges_.empty() && position == 0) { + return; + } + int64_t toSkip = position; + for (auto& range : ranges_) { + if (toSkip <= range.size) { + current_ = &range; + current_->position = toSkip; + return; + } + toSkip -= range.size; + } + static_assert(sizeof(std::streamsize) <= sizeof(long long)); + VELOX_FAIL("Seeking past end of GlutenByteInputStream: {}", static_cast(position)); + } + + /// Returns the remaining size left from current reading position. + size_t remainingSize() const { + if (ranges_.empty()) { + return 0; + } + const auto* lastRange = &ranges_[ranges_.size() - 1]; + auto cur = current_; + size_t total = cur->size - cur->position; + while (++cur <= lastRange) { + total += cur->size; + } + return total; + } + + std::string toString() const { + std::stringstream oss; + oss << ranges_.size() << " ranges (position/size) ["; + for (const auto& range : ranges_) { + oss << "(" << range.position << "/" << range.size << (&range == current_ ? " current" : "") << ")"; + if (&range != &ranges_.back()) { + oss << ","; + } + } + oss << "]"; + return oss.str(); + } + + uint8_t readByte() { + if (current_->position < current_->size) { + return current_->buffer[current_->position++]; + } + next(); + return readByte(); + } + + void readBytes(uint8_t* bytes, int32_t size) { + VELOX_CHECK_GE(size, 0, "Attempting to read negative number of bytes"); + int32_t offset = 0; + for (;;) { + int32_t available = current_->size - current_->position; + int32_t numUsed = std::min(available, size); + simd::memcpy(bytes + offset, current_->buffer + current_->position, numUsed); + offset += numUsed; + size -= numUsed; + current_->position += numUsed; + if (!size) { + return; + } + next(); + } + } + + template + T read() { + if (current_->position + sizeof(T) <= current_->size) { + current_->position += sizeof(T); + return *reinterpret_cast(current_->buffer + current_->position - sizeof(T)); + } + // The number straddles two buffers. We read byte by byte and make + // a little-endian uint64_t. The bytes can be cast to any integer + // or floating point type since the wire format has the machine byte order. + static_assert(sizeof(T) <= sizeof(uint64_t)); + uint64_t value = 0; + for (int32_t i = 0; i < sizeof(T); ++i) { + value |= static_cast(readByte()) << (i * 8); + } + return *reinterpret_cast(&value); + } + + template + void readBytes(Char* data, int32_t size) { + readBytes(reinterpret_cast(data), size); + } + + /// Returns a view over the read buffer for up to 'size' next + /// bytes. The size of the value may be less if the current byte + /// range ends within 'size' bytes from the current position. The + /// size will be 0 if at end.
+ std::string_view nextView(int32_t size) { + VELOX_CHECK_GE(size, 0, "Attempting to view negative number of bytes"); + if (current_->position == current_->size) { + if (current_ == &ranges_.back()) { + return std::string_view(nullptr, 0); + } + next(); + } + VELOX_CHECK(current_->size); + auto position = current_->position; + auto viewSize = std::min(current_->size - current_->position, size); + current_->position += viewSize; + return std::string_view(reinterpret_cast(current_->buffer) + position, viewSize); + } + + void skip(int32_t size) { + VELOX_CHECK_GE(size, 0, "Attempting to skip negative number of bytes"); + for (;;) { + int32_t available = current_->size - current_->position; + int32_t numUsed = std::min(available, size); + size -= numUsed; + current_->position += numUsed; + if (!size) { + return; + } + next(); + } + } + + protected: + /// Sets 'current_' to point to the next range of input. // The + /// input is consecutive ByteRanges in 'ranges_' for the base class + /// but any view over external buffers can be made by specialization. + /// + /// TODO: Remove 'virtual' after refactoring SpillInput. + virtual void next(bool throwIfPastEnd = true) { + VELOX_CHECK(current_ >= &ranges_[0]); + size_t position = current_ - &ranges_[0]; + VELOX_CHECK_LT(position, ranges_.size()); + if (position == ranges_.size() - 1) { + if (throwIfPastEnd) { + VELOX_FAIL("Reading past end of GlutenByteInputStream"); + } + return; + } + ++current_; + current_->position = 0; + } + + // TODO: Remove after refactoring SpillInput. + const std::vector& ranges() const { + return ranges_; + } + + // TODO: Remove after refactoring SpillInput. + void setRange(ByteRange range) { + ranges_.resize(1); + ranges_[0] = range; + current_ = ranges_.data(); + } +}; + +template <> +inline Timestamp GlutenByteInputStream::read() { + Timestamp value; + readBytes(reinterpret_cast(&value), sizeof(value)); + return value; +} + +template <> +inline int128_t GlutenByteInputStream::read() { + int128_t value; + readBytes(reinterpret_cast(&value), sizeof(value)); + return value; +} +} // namespace facebook::velox diff --git a/cpp/velox/shuffle/VeloxHashShuffleWriter.cc b/cpp/velox/shuffle/VeloxHashShuffleWriter.cc index e165d4a91da8f..00d8be16656ef 100644 --- a/cpp/velox/shuffle/VeloxHashShuffleWriter.cc +++ b/cpp/velox/shuffle/VeloxHashShuffleWriter.cc @@ -1364,6 +1364,7 @@ arrow::Result VeloxHashShuffleWriter::evictPartitionBuffersMinSize(int6 auto pid = item.first; ARROW_ASSIGN_OR_RAISE(auto buffers, assembleBuffers(pid, false)); auto payload = std::make_unique(item.second, &isValidityBuffer_, std::move(buffers)); + metrics_.totalBytesToEvict += payload->rawSize(); RETURN_NOT_OK(partitionWriter_->evict(pid, std::move(payload), Evict::kSpill, false, hasComplexType_, false)); evicted = beforeEvict - partitionBufferPool_->bytes_allocated(); if (evicted >= size) { diff --git a/cpp/velox/shuffle/VeloxShuffleReader.cc b/cpp/velox/shuffle/VeloxShuffleReader.cc index ab93d9a33d04a..3966857b9e9d0 100644 --- a/cpp/velox/shuffle/VeloxShuffleReader.cc +++ b/cpp/velox/shuffle/VeloxShuffleReader.cc @@ -16,6 +16,7 @@ */ #include "VeloxShuffleReader.h" +#include "GlutenByteStream.h" #include #include @@ -177,7 +178,7 @@ VectorPtr readFlatVector( std::unique_ptr toByteStream(uint8_t* data, int32_t size) { std::vector byteRanges; byteRanges.push_back(ByteRange{data, size, 0}); - auto byteStream = std::make_unique(byteRanges); + auto byteStream = std::make_unique(byteRanges); return byteStream; } @@ -312,8 +313,7 @@ std::shared_ptr 
VeloxHashShuffleReaderDeserializer::next() { if (hasComplexType_) { uint32_t numRows; GLUTEN_ASSIGN_OR_THROW( - auto arrowBuffers, - BlockPayload::deserialize(in_.get(), schema_, codec_, memoryPool_, numRows, decompressTime_)); + auto arrowBuffers, BlockPayload::deserialize(in_.get(), codec_, memoryPool_, numRows, decompressTime_)); if (numRows == 0) { // Reach EOS. return nullptr; @@ -332,7 +332,7 @@ std::shared_ptr VeloxHashShuffleReaderDeserializer::next() { uint32_t numRows = 0; while (!merged_ || merged_->numRows() < batchSize_) { GLUTEN_ASSIGN_OR_THROW( - arrowBuffers, BlockPayload::deserialize(in_.get(), schema_, codec_, memoryPool_, numRows, decompressTime_)); + arrowBuffers, BlockPayload::deserialize(in_.get(), codec_, memoryPool_, numRows, decompressTime_)); if (numRows == 0) { reachEos_ = true; break; @@ -401,7 +401,7 @@ std::shared_ptr VeloxSortShuffleReaderDeserializer::next() { while (cachedRows_ < batchSize_) { uint32_t numRows; GLUTEN_ASSIGN_OR_THROW( - auto arrowBuffers, BlockPayload::deserialize(in_.get(), schema_, codec_, arrowPool_, numRows, decompressTime_)); + auto arrowBuffers, BlockPayload::deserialize(in_.get(), codec_, arrowPool_, numRows, decompressTime_)); if (numRows == 0) { reachEos_ = true; @@ -451,7 +451,7 @@ std::shared_ptr VeloxSortShuffleReaderDeserializer::deserializeTo return std::make_shared(std::move(rowVector)); } -class VeloxRssSortShuffleReaderDeserializer::VeloxInputStream : public facebook::velox::ByteInputStream { +class VeloxRssSortShuffleReaderDeserializer::VeloxInputStream : public facebook::velox::GlutenByteInputStream { public: VeloxInputStream(std::shared_ptr input, facebook::velox::BufferPtr buffer); diff --git a/cpp/velox/shuffle/VeloxSortShuffleWriter.cc b/cpp/velox/shuffle/VeloxSortShuffleWriter.cc index c0d9b467d98c8..2bfc4908d2f66 100644 --- a/cpp/velox/shuffle/VeloxSortShuffleWriter.cc +++ b/cpp/velox/shuffle/VeloxSortShuffleWriter.cc @@ -105,7 +105,7 @@ arrow::Status VeloxSortShuffleWriter::init() { ARROW_RETURN_IF( options_.partitioning == Partitioning::kSingle, arrow::Status::Invalid("VeloxSortShuffleWriter doesn't support single partition.")); - initArray(); + allocateMinimalArray(); sortedBuffer_ = facebook::velox::AlignedBuffer::allocate(kSortedBufferSize, veloxPool_.get()); rawBuffer_ = sortedBuffer_->asMutable(); return arrow::Status::OK(); @@ -260,15 +260,16 @@ arrow::Status VeloxSortShuffleWriter::evictAllPartitions() { pageCursor_ = 0; // Reset and reallocate array_ to minimal size. Allocate array_ can trigger spill. - initArray(); + allocateMinimalArray(); } return arrow::Status::OK(); } arrow::Status VeloxSortShuffleWriter::evictPartition(uint32_t partitionId, size_t begin, size_t end) { - ScopedTimer timer(&sortTime_); + // Count copy row time into sortTime_. 
+ Timer sortTime{}; // Serialize [begin, end) - uint64_t offset = 0; + int64_t offset = 0; char* addr; uint32_t size; @@ -278,13 +279,9 @@ arrow::Status VeloxSortShuffleWriter::evictPartition(uint32_t partitionId, size_ addr = pageAddresses_[pageIndex.first] + pageIndex.second; size = *(RowSizeType*)addr; if (offset + size > kSortedBufferSize) { - VELOX_CHECK(offset > 0); - auto payload = std::make_unique( - index - begin, - nullptr, - std::vector>{std::make_shared(rawBuffer_, offset)}); - RETURN_NOT_OK( - partitionWriter_->evict(partitionId, std::move(payload), Evict::type::kSortSpill, false, false, stopped_)); + sortTime.stop(); + RETURN_NOT_OK(evictPartition0(partitionId, index - begin, offset)); + sortTime.start(); begin = index; offset = 0; } @@ -292,45 +289,69 @@ arrow::Status VeloxSortShuffleWriter::evictPartition(uint32_t partitionId, size_ offset += size; index++; } + sortTime.stop(); + RETURN_NOT_OK(evictPartition0(partitionId, index - begin, offset)); + + sortTime_ += sortTime.realTimeUsed(); + return arrow::Status::OK(); +} + +arrow::Status VeloxSortShuffleWriter::evictPartition0(uint32_t partitionId, uint32_t numRows, int64_t rawLength) { + VELOX_CHECK(rawLength > 0); auto payload = std::make_unique( - end - begin, + numRows, nullptr, - std::vector>{std::make_shared(rawBuffer_, offset)}); + std::vector>{std::make_shared(rawBuffer_, rawLength)}); + updateSpillMetrics(payload); RETURN_NOT_OK( partitionWriter_->evict(partitionId, std::move(payload), Evict::type::kSortSpill, false, false, stopped_)); return arrow::Status::OK(); } -uint32_t VeloxSortShuffleWriter::maxRowsToInsert(uint32_t offset, uint32_t rows) { +uint32_t VeloxSortShuffleWriter::maxRowsToInsert(uint32_t offset, uint32_t remainingRows) { // Check how many rows can be handled. if (pages_.empty()) { return 0; } auto remainingBytes = pages_.back()->size() - pageCursor_; if (fixedRowSize_) { - return std::min((uint32_t)(remainingBytes / (fixedRowSize_.value())), rows); + return std::min((uint32_t)(remainingBytes / (fixedRowSize_.value())), remainingRows); } auto beginIter = rowSizePrefixSum_.begin() + 1 + offset; - auto iter = std::upper_bound(beginIter, rowSizePrefixSum_.end(), remainingBytes); + auto bytesWritten = rowSizePrefixSum_[offset]; + auto iter = std::upper_bound(beginIter, rowSizePrefixSum_.end(), remainingBytes + bytesWritten); return iter - beginIter; } void VeloxSortShuffleWriter::acquireNewBuffer(uint64_t memLimit, uint64_t minSizeRequired) { - auto size = std::max(std::min(memLimit >> 2, 64UL * 1024 * 1024), minSizeRequired); + DLOG_IF(INFO, !pages_.empty()) << "Acquire new buffer. current capacity: " << pages_.back()->capacity() + << ", size: " << pages_.back()->size() << ", pageCursor: " << pageCursor_ + << ", unused: " << pages_.back()->capacity() - pageCursor_; + auto size = std::max( + std::min( + std::max(memLimit >> 2, facebook::velox::AlignedBuffer::kPaddedSize), 64UL * 1024 * 1024) - + facebook::velox::AlignedBuffer::kPaddedSize, + minSizeRequired); // Allocating new buffer can trigger spill. - auto newBuffer = facebook::velox::AlignedBuffer::allocate(size, veloxPool_.get(), 0); + auto newBuffer = facebook::velox::AlignedBuffer::allocate(size, veloxPool_.get()); + DLOG(INFO) << "Allocated new buffer. 
capacity: " << newBuffer->capacity() << ", size: " << newBuffer->size(); + auto newBufferSize = newBuffer->capacity(); + newBuffer->setSize(newBufferSize); + + currentPage_ = newBuffer->asMutable(); + currenPageSize_ = newBufferSize; + memset(currentPage_, 0, newBufferSize); + // If spill triggered, clear pages_. if (offset_ == 0 && pages_.size() > 0) { pageAddresses_.clear(); pages_.clear(); } - currentPage_ = newBuffer->asMutable(); pageAddresses_.emplace_back(currentPage_); pages_.emplace_back(std::move(newBuffer)); pageCursor_ = 0; pageNumber_ = pages_.size() - 1; - currenPageSize_ = pages_.back()->size(); } void VeloxSortShuffleWriter::growArrayIfNecessary(uint32_t rows) { @@ -339,16 +360,12 @@ void VeloxSortShuffleWriter::growArrayIfNecessary(uint32_t rows) { // May trigger spill. auto newSizeBytes = newSize * sizeof(uint64_t); auto newArray = facebook::velox::AlignedBuffer::allocate(newSizeBytes, veloxPool_.get()); - // Check if already satisfies. + // Check if already satisfies (spill has been triggered). if (newArraySize(rows) > arraySize_) { - auto newPtr = newArray->asMutable(); if (offset_ > 0) { - gluten::fastCopy(newPtr, arrayPtr_, offset_ * sizeof(uint64_t)); + gluten::fastCopy(newArray->asMutable(), arrayPtr_, offset_ * sizeof(uint64_t)); } - arraySize_ = newSize; - arrayPtr_ = newPtr; - array_.reset(); - array_.swap(newArray); + setUpArray(std::move(newArray)); } } } @@ -363,9 +380,13 @@ uint32_t VeloxSortShuffleWriter::newArraySize(uint32_t rows) { return newSize; } -void VeloxSortShuffleWriter::initArray() { - arraySize_ = options_.sortBufferInitialSize; - array_ = facebook::velox::AlignedBuffer::allocate(arraySize_ * sizeof(uint64_t), veloxPool_.get()); +void VeloxSortShuffleWriter::setUpArray(facebook::velox::BufferPtr&& array) { + array_.reset(); + array_ = std::move(array); + // Capacity is a multiple of 8 (bytes). + auto capacity = array_->capacity() & 0xfffffff8; + array_->setSize(capacity); + arraySize_ = capacity >> 3; arrayPtr_ = array_->asMutable(); } @@ -381,8 +402,15 @@ int64_t VeloxSortShuffleWriter::totalC2RTime() const { return c2rTime_; } -int VeloxSortShuffleWriter::compare(const void* a, const void* b) { - // No same values. - return *(uint64_t*)a > *(uint64_t*)b ? 
1 : -1; +void VeloxSortShuffleWriter::allocateMinimalArray() { + auto array = facebook::velox::AlignedBuffer::allocate( + options_.sortBufferInitialSize * sizeof(uint64_t), veloxPool_.get()); + setUpArray(std::move(array)); +} + +void VeloxSortShuffleWriter::updateSpillMetrics(const std::unique_ptr& payload) { + if (!stopped_) { + metrics_.totalBytesToEvict += payload->rawSize(); + } } } // namespace gluten diff --git a/cpp/velox/shuffle/VeloxSortShuffleWriter.h b/cpp/velox/shuffle/VeloxSortShuffleWriter.h index 69b8b25030955..1626573a7dc6e 100644 --- a/cpp/velox/shuffle/VeloxSortShuffleWriter.h +++ b/cpp/velox/shuffle/VeloxSortShuffleWriter.h @@ -77,7 +77,9 @@ class VeloxSortShuffleWriter final : public VeloxShuffleWriter { arrow::Status evictPartition(uint32_t partitionId, size_t begin, size_t end); - uint32_t maxRowsToInsert(uint32_t offset, uint32_t rows); + arrow::Status evictPartition0(uint32_t partitionId, uint32_t numRows, int64_t rawLength); + + uint32_t maxRowsToInsert(uint32_t offset, uint32_t remainingRows); void acquireNewBuffer(uint64_t memLimit, uint64_t minSizeRequired); @@ -85,9 +87,11 @@ class VeloxSortShuffleWriter final : public VeloxShuffleWriter { uint32_t newArraySize(uint32_t rows); - void initArray(); + void setUpArray(facebook::velox::BufferPtr&& array); + + void allocateMinimalArray(); - static int compare(const void* a, const void* b); + void updateSpillMetrics(const std::unique_ptr& payload); // Stores compact row id -> row facebook::velox::BufferPtr array_; diff --git a/cpp/velox/substrait/SubstraitToVeloxPlan.cc b/cpp/velox/substrait/SubstraitToVeloxPlan.cc index d7de841191ed0..1604c15e338a4 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlan.cc +++ b/cpp/velox/substrait/SubstraitToVeloxPlan.cc @@ -1148,7 +1148,8 @@ core::PlanNodePtr SubstraitToVeloxPlanConverter::toVeloxPlan(const ::substrait:: std::vector veloxTypeList; std::vector columnTypes; // Convert field names into lower case when not case-sensitive. - std::unique_ptr veloxCfg = std::make_unique(confMap_); + std::unique_ptr veloxCfg = + std::make_unique(std::unordered_map(confMap_)); bool asLowerCase = !veloxCfg->get(kCaseSensitive, false); if (readRel.has_base_schema()) { const auto& baseSchema = readRel.base_schema(); diff --git a/cpp/velox/tests/CMakeLists.txt b/cpp/velox/tests/CMakeLists.txt index f7bc1cb13ee79..b8ea12e944aae 100644 --- a/cpp/velox/tests/CMakeLists.txt +++ b/cpp/velox/tests/CMakeLists.txt @@ -39,9 +39,13 @@ set(VELOX_TEST_COMMON_SRCS JsonToProtoConverter.cc FilePathGenerator.cc) add_velox_test(velox_shuffle_writer_test SOURCES VeloxShuffleWriterTest.cc) # TODO: ORC is not well supported. 
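The maxRowsToInsert() fix in VeloxSortShuffleWriter above changes the prefix-sum search for variable-length rows: the upper_bound is taken against remainingBytes plus the bytes already written at the given offset, not remainingBytes alone. A stand-alone sketch of that calculation with a small worked example.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

uint32_t maxRowsToInsert(const std::vector<uint64_t>& rowSizePrefixSum,  // prefixSum[0] == 0
                         uint32_t offset, uint64_t remainingBytes) {
  auto beginIter = rowSizePrefixSum.begin() + 1 + offset;
  auto bytesWritten = rowSizePrefixSum[offset];
  auto iter = std::upper_bound(beginIter, rowSizePrefixSum.end(), remainingBytes + bytesWritten);
  return static_cast<uint32_t>(iter - beginIter);
}

int main() {
  // Rows of sizes 10, 20, 30, 40 -> prefix sums {0, 10, 30, 60, 100}.
  std::vector<uint64_t> prefixSum{0, 10, 30, 60, 100};
  // Two rows already inserted (offset = 2, 30 bytes written); 50 bytes left in the page
  // fits only the 30-byte row, not the following 40-byte row.
  std::cout << maxRowsToInsert(prefixSum, 2, 50) << std::endl;  // prints 1
  return 0;
}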
add_velox_test(orc_test SOURCES OrcTest.cc) add_velox_test( - velox_operators_test SOURCES VeloxColumnarToRowTest.cc - VeloxRowToColumnarTest.cc VeloxColumnarBatchSerializerTest.cc - VeloxColumnarBatchTest.cc) + velox_operators_test + SOURCES + VeloxColumnarToRowTest.cc + VeloxRowToColumnarTest.cc + VeloxColumnarBatchSerializerTest.cc + VeloxColumnarBatchTest.cc + VeloxBatchResizerTest.cc) add_velox_test( velox_plan_conversion_test SOURCES diff --git a/cpp/velox/tests/MemoryManagerTest.cc b/cpp/velox/tests/MemoryManagerTest.cc index d86bd46e230d3..bb102dc2d8c31 100644 --- a/cpp/velox/tests/MemoryManagerTest.cc +++ b/cpp/velox/tests/MemoryManagerTest.cc @@ -128,4 +128,274 @@ TEST_F(MemoryManagerTest, memoryAllocatorWithBlockReservation) { ASSERT_EQ(allocator_->getBytes(), 0); } +namespace { +class AllocationListenerWrapper : public AllocationListener { + public: + explicit AllocationListenerWrapper() {} + + void set(AllocationListener* const delegate) { + if (delegate_ != nullptr) { + throw std::runtime_error("Invalid state"); + } + delegate_ = delegate; + } + + void allocationChanged(int64_t diff) override { + delegate_->allocationChanged(diff); + } + int64_t currentBytes() override { + return delegate_->currentBytes(); + } + int64_t peakBytes() override { + return delegate_->peakBytes(); + } + + private: + AllocationListener* delegate_{nullptr}; +}; + +class SpillableAllocationListener : public AllocationListener { + public: + virtual uint64_t shrink(uint64_t bytes) = 0; + virtual uint64_t spill(uint64_t bytes) = 0; +}; + +class MockSparkTaskMemoryManager { + public: + explicit MockSparkTaskMemoryManager(const uint64_t maxBytes); + + AllocationListener* newListener(std::function shrink, std::function spill); + + uint64_t acquire(uint64_t bytes); + void release(uint64_t bytes); + uint64_t currentBytes() { + return currentBytes_; + } + + private: + mutable std::recursive_mutex mutex_; + std::vector> listeners_{}; + + const uint64_t maxBytes_; + uint64_t currentBytes_{0L}; +}; + +class MockSparkAllocationListener : public SpillableAllocationListener { + public: + explicit MockSparkAllocationListener( + MockSparkTaskMemoryManager* const manager, + std::function shrink, + std::function spill) + : manager_(manager), shrink_(shrink), spill_(spill) {} + + void allocationChanged(int64_t diff) override { + if (diff == 0) { + return; + } + if (diff > 0) { + auto granted = manager_->acquire(diff); + if (granted < diff) { + throw std::runtime_error("OOM"); + } + currentBytes_ += granted; + return; + } + manager_->release(-diff); + currentBytes_ -= (-diff); + } + + uint64_t shrink(uint64_t bytes) override { + return shrink_(bytes); + } + + uint64_t spill(uint64_t bytes) override { + return spill_(bytes); + } + + int64_t currentBytes() override { + return currentBytes_; + } + + private: + MockSparkTaskMemoryManager* const manager_; + std::function shrink_; + std::function spill_; + std::atomic currentBytes_{0L}; +}; + +MockSparkTaskMemoryManager::MockSparkTaskMemoryManager(const uint64_t maxBytes) : maxBytes_(maxBytes) {} + +AllocationListener* MockSparkTaskMemoryManager::newListener( + std::function shrink, + std::function spill) { + listeners_.push_back(std::make_unique(this, shrink, spill)); + return listeners_.back().get(); +} + +uint64_t MockSparkTaskMemoryManager::acquire(uint64_t bytes) { + std::unique_lock l(mutex_); + auto freeBytes = maxBytes_ - currentBytes_; + if (bytes <= freeBytes) { + currentBytes_ += bytes; + return bytes; + } + // Shrink listeners. 
+ int64_t bytesNeeded = bytes - freeBytes; + for (const auto& listener : listeners_) { + bytesNeeded -= listener->shrink(bytesNeeded); + if (bytesNeeded < 0) { + break; + } + } + if (bytesNeeded > 0) { + for (const auto& listener : listeners_) { + bytesNeeded -= listener->spill(bytesNeeded); + if (bytesNeeded < 0) { + break; + } + } + } + + if (bytesNeeded > 0) { + uint64_t granted = bytes - bytesNeeded; + currentBytes_ += granted; + return granted; + } + + currentBytes_ += bytes; + return bytes; +} + +void MockSparkTaskMemoryManager::release(uint64_t bytes) { + std::unique_lock l(mutex_); + currentBytes_ -= bytes; +} + +class MockMemoryReclaimer : public facebook::velox::memory::MemoryReclaimer { + public: + explicit MockMemoryReclaimer(std::vector& buffs, int32_t size) : buffs_(buffs), size_(size) {} + + bool reclaimableBytes(const memory::MemoryPool& pool, uint64_t& reclaimableBytes) const override { + uint64_t total = 0; + for (const auto& buf : buffs_) { + if (buf == nullptr) { + continue; + } + total += size_; + } + if (total == 0) { + return false; + } + reclaimableBytes = total; + return true; + } + + uint64_t reclaim(memory::MemoryPool* pool, uint64_t targetBytes, uint64_t maxWaitMs, Stats& stats) override { + uint64_t total = 0; + for (auto& buf : buffs_) { + if (buf == nullptr) { + // When: + // 1. Called by allocation from the same pool so buff is not allocated yet. + // 2. Already called once. + continue; + } + pool->free(buf, size_); + buf = nullptr; + total += size_; + } + return total; + } + + private: + std::vector& buffs_; + int32_t size_; +}; + +void assertCapacitiesMatch(MockSparkTaskMemoryManager& tmm, std::vector>& vmms) { + uint64_t sum = 0; + for (const auto& vmm : vmms) { + if (vmm == nullptr) { + continue; + } + sum += vmm->getAggregateMemoryPool()->capacity(); + } + if (tmm.currentBytes() != sum) { + ASSERT_EQ(tmm.currentBytes(), sum); + } +} +} // namespace + +class MultiMemoryManagerTest : public ::testing::Test { + protected: + static void SetUpTestCase() { + std::unordered_map conf = { + {kMemoryReservationBlockSize, std::to_string(kMemoryReservationBlockSizeDefault)}, + {kVeloxMemInitCapacity, std::to_string(kVeloxMemInitCapacityDefault)}}; + gluten::VeloxBackend::create(conf); + } + + std::unique_ptr newVeloxMemoryManager(std::unique_ptr listener) { + return std::make_unique(std::move(listener)); + } +}; + +TEST_F(MultiMemoryManagerTest, spill) { + const uint64_t maxBytes = 200 << 20; + const uint32_t numThreads = 100; + const uint32_t numAllocations = 200; + const int32_t allocateSize = 10 << 20; + + MockSparkTaskMemoryManager tmm{maxBytes}; + std::vector> vmms{}; + std::vector threads{}; + std::vector> buffs{}; + for (size_t i = 0; i < numThreads; ++i) { + buffs.push_back({}); + vmms.emplace_back(nullptr); + } + + // Emulate a shared lock to avoid ABBA deadlock. + std::recursive_mutex mutex; + + for (size_t i = 0; i < numThreads; ++i) { + threads.emplace_back([this, i, allocateSize, &tmm, &vmms, &mutex, &buffs]() -> void { + auto wrapper = std::make_unique(); // Set later. + auto* listener = wrapper.get(); + + facebook::velox::memory::MemoryPool* pool; // Set later. 
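MockSparkTaskMemoryManager::acquire() above grants from free space first, then asks listeners to shrink, then to spill, and finally grants whatever could be satisfied. A compressed stand-alone model of that ordering; Listener is a plain struct here, not Gluten's AllocationListener interface, and the spill callback frees capacity directly for brevity.

#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

struct Listener {
  std::function<uint64_t(uint64_t)> shrink;
  std::function<uint64_t(uint64_t)> spill;
};

uint64_t acquire(uint64_t& current, uint64_t maxBytes, std::vector<Listener>& listeners, uint64_t bytes) {
  uint64_t freeBytes = maxBytes - current;
  if (bytes <= freeBytes) {
    current += bytes;
    return bytes;
  }
  int64_t needed = static_cast<int64_t>(bytes - freeBytes);
  for (auto& l : listeners) {               // try the cheap path first
    needed -= static_cast<int64_t>(l.shrink(static_cast<uint64_t>(needed)));
    if (needed <= 0) break;
  }
  if (needed > 0) {
    for (auto& l : listeners) {             // then fall back to spilling
      needed -= static_cast<int64_t>(l.spill(static_cast<uint64_t>(needed)));
      if (needed <= 0) break;
    }
  }
  uint64_t granted = needed > 0 ? bytes - static_cast<uint64_t>(needed) : bytes;
  current += granted;
  return granted;
}

int main() {
  uint64_t current = 90;
  const uint64_t maxBytes = 100;
  std::vector<Listener> listeners{
      {[](uint64_t) { return uint64_t(0); },                     // nothing to shrink
       [&current](uint64_t n) { current -= n; return n; }}};     // spilling frees capacity
  std::cout << acquire(current, maxBytes, listeners, 30) << std::endl;  // 30: 10 free + 20 spilled
  return 0;
}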
+ { + std::unique_lock l(mutex); + vmms[i] = newVeloxMemoryManager(std::move(wrapper)); + pool = vmms[i]->getLeafMemoryPool().get(); + pool->setReclaimer(std::make_unique(buffs[i], allocateSize)); + listener->set(tmm.newListener( + [](uint64_t bytes) -> uint64_t { return 0; }, + [i, &vmms, &mutex](uint64_t bytes) -> uint64_t { + std::unique_lock l(mutex); + return vmms[i]->getMemoryManager()->arbitrator()->shrinkCapacity(bytes); + })); + } + { + std::unique_lock l(mutex); + for (size_t j = 0; j < numAllocations; ++j) { + assertCapacitiesMatch(tmm, vmms); + buffs[i].push_back(pool->allocate(allocateSize)); + assertCapacitiesMatch(tmm, vmms); + } + } + }); + } + + for (auto& thread : threads) { + thread.join(); + } + + for (auto& vmm : vmms) { + assertCapacitiesMatch(tmm, vmms); + vmm->getMemoryManager()->arbitrator()->shrinkCapacity(allocateSize * numAllocations); + assertCapacitiesMatch(tmm, vmms); + } + + ASSERT_EQ(tmm.currentBytes(), 0); +} } // namespace gluten diff --git a/cpp/velox/tests/VeloxBatchResizerTest.cc b/cpp/velox/tests/VeloxBatchResizerTest.cc new file mode 100644 index 0000000000000..aecd52f927cc8 --- /dev/null +++ b/cpp/velox/tests/VeloxBatchResizerTest.cc @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include "utils/VeloxBatchResizer.h" +#include "velox/vector/tests/utils/VectorTestBase.h" + +using namespace facebook::velox; + +namespace gluten { +class ColumnarBatchArray : public ColumnarBatchIterator { + public: + explicit ColumnarBatchArray(const std::vector> batches) + : batches_(std::move(batches)) {} + + std::shared_ptr next() override { + if (cursor_ >= batches_.size()) { + return nullptr; + } + return batches_[cursor_++]; + } + + private: + const std::vector> batches_; + int32_t cursor_ = 0; +}; + +class VeloxBatchResizerTest : public ::testing::Test, public test::VectorTestBase { + protected: + static void SetUpTestCase() { + memory::MemoryManager::testingSetInstance({}); + } + + RowVectorPtr newVector(size_t numRows) { + auto constant = makeConstant(1, numRows); + auto out = + std::make_shared(pool(), ROW({INTEGER()}), nullptr, numRows, std::vector{constant}); + return out; + } + + void checkResize(int32_t min, int32_t max, std::vector inSizes, std::vector outSizes) { + auto inBatches = std::vector>(); + for (const auto& size : inSizes) { + inBatches.push_back(std::make_shared(newVector(size))); + } + VeloxBatchResizer resizer(pool(), min, max, std::make_unique(std::move(inBatches))); + auto actualOutSizes = std::vector(); + while (true) { + auto next = resizer.next(); + if (next == nullptr) { + break; + } + actualOutSizes.push_back(next->numRows()); + } + ASSERT_EQ(actualOutSizes, outSizes); + } +}; + +TEST_F(VeloxBatchResizerTest, sanity) { + checkResize(100, std::numeric_limits::max(), {30, 50, 30, 40, 30}, {110, 70}); + checkResize(1, 40, {10, 20, 50, 30, 40, 30}, {10, 20, 40, 10, 30, 40, 30}); + checkResize(1, 39, {10, 20, 50, 30, 40, 30}, {10, 20, 39, 11, 30, 39, 1, 30}); + checkResize(40, 40, {10, 20, 50, 30, 40, 30}, {30, 40, 10, 30, 40, 30}); + checkResize(39, 39, {10, 20, 50, 30, 40, 30}, {30, 39, 11, 30, 39, 1, 30}); + checkResize(100, 200, {5, 900, 50}, {5, 200, 200, 200, 200, 100, 50}); + checkResize(100, 200, {5, 900, 30, 80}, {5, 200, 200, 200, 200, 100, 110}); + checkResize(100, 200, {5, 900, 700}, {5, 200, 200, 200, 200, 100, 200, 200, 200, 100}); + ASSERT_ANY_THROW(checkResize(0, 0, {}, {})); +} +} // namespace gluten diff --git a/cpp/velox/tests/VeloxShuffleWriterTest.cc b/cpp/velox/tests/VeloxShuffleWriterTest.cc index af9d5a58db0d4..7cbfbcd79cc95 100644 --- a/cpp/velox/tests/VeloxShuffleWriterTest.cc +++ b/cpp/velox/tests/VeloxShuffleWriterTest.cc @@ -407,7 +407,22 @@ TEST_P(RoundRobinPartitioningShuffleWriter, spillVerifyResult) { shuffleWriteReadMultiBlocks(*shuffleWriter, 2, inputVector1_->type(), {{blockPid1}, {blockPid2}}); } -TEST_F(VeloxShuffleWriterMemoryTest, memoryLeak) { +TEST_P(RoundRobinPartitioningShuffleWriter, sortMaxRows) { + if (GetParam().shuffleWriterType != kSortShuffle) { + return; + } + ASSERT_NOT_OK(initShuffleWriterOptions()); + auto shuffleWriter = createShuffleWriter(defaultArrowMemoryPool().get()); + + // Set memLimit to 0 to force allocate a new buffer for each row. 
+ ASSERT_NOT_OK(splitRowVector(*shuffleWriter, inputVector1_, 0)); + + auto blockPid1 = takeRows({inputVector1_}, {{0, 2, 4, 6, 8}}); + auto blockPid2 = takeRows({inputVector1_}, {{1, 3, 5, 7, 9}}); + shuffleWriteReadMultiBlocks(*shuffleWriter, 2, inputVector1_->type(), {{blockPid1}, {blockPid2}}); +} + +TEST_F(VeloxHashShuffleWriterMemoryTest, memoryLeak) { ASSERT_NOT_OK(initShuffleWriterOptions()); std::shared_ptr pool = std::make_shared(); shuffleWriterOptions_.bufferSize = 4; @@ -425,7 +440,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, memoryLeak) { ASSERT_TRUE(pool->bytes_allocated() == 0); } -TEST_F(VeloxShuffleWriterMemoryTest, spillFailWithOutOfMemory) { +TEST_F(VeloxHashShuffleWriterMemoryTest, spillFailWithOutOfMemory) { ASSERT_NOT_OK(initShuffleWriterOptions()); std::shared_ptr pool = std::make_shared(0); shuffleWriterOptions_.bufferSize = 4; @@ -438,7 +453,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, spillFailWithOutOfMemory) { ASSERT_TRUE(status.IsOutOfMemory()); } -TEST_F(VeloxShuffleWriterMemoryTest, kInit) { +TEST_F(VeloxHashShuffleWriterMemoryTest, kInit) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.bufferSize = 4; auto shuffleWriter = createShuffleWriter(defaultArrowMemoryPool().get()); @@ -508,7 +523,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, kInit) { ASSERT_NOT_OK(shuffleWriter->stop()); } -TEST_F(VeloxShuffleWriterMemoryTest, kInitSingle) { +TEST_F(VeloxHashShuffleWriterMemoryTest, kInitSingle) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.partitioning = Partitioning::kSingle; shuffleWriterOptions_.bufferSize = 4; @@ -530,7 +545,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, kInitSingle) { ASSERT_NOT_OK(shuffleWriter->stop()); } -TEST_F(VeloxShuffleWriterMemoryTest, kSplit) { +TEST_F(VeloxHashShuffleWriterMemoryTest, kSplit) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.bufferSize = 4; auto pool = SelfEvictedMemoryPool(defaultArrowMemoryPool().get(), false); @@ -552,7 +567,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, kSplit) { ASSERT_NOT_OK(shuffleWriter->stop()); } -TEST_F(VeloxShuffleWriterMemoryTest, kSplitSingle) { +TEST_F(VeloxHashShuffleWriterMemoryTest, kSplitSingle) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.partitioning = Partitioning::kSingle; auto pool = SelfEvictedMemoryPool(defaultArrowMemoryPool().get(), false); @@ -570,7 +585,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, kSplitSingle) { ASSERT_NOT_OK(shuffleWriter->stop()); } -TEST_F(VeloxShuffleWriterMemoryTest, kStop) { +TEST_F(VeloxHashShuffleWriterMemoryTest, kStop) { for (const auto partitioning : {Partitioning::kSingle, Partitioning::kRoundRobin}) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.partitioning = partitioning; @@ -592,7 +607,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, kStop) { } } -TEST_F(VeloxShuffleWriterMemoryTest, kStopComplex) { +TEST_F(VeloxHashShuffleWriterMemoryTest, kStopComplex) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.bufferSize = 4; auto pool = SelfEvictedMemoryPool(defaultArrowMemoryPool().get(), false); @@ -613,7 +628,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, kStopComplex) { ASSERT_TRUE(pool.checkEvict(pool.bytes_allocated(), [&] { ASSERT_NOT_OK(shuffleWriter->stop()); })); } -TEST_F(VeloxShuffleWriterMemoryTest, evictPartitionBuffers) { +TEST_F(VeloxHashShuffleWriterMemoryTest, evictPartitionBuffers) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.bufferSize = 4; auto pool = SelfEvictedMemoryPool(defaultArrowMemoryPool().get(), false); @@ 
-635,7 +650,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, evictPartitionBuffers) { ASSERT_EQ(shuffleWriter->partitionBufferSize(), 0); } -TEST_F(VeloxShuffleWriterMemoryTest, kUnevictableSingle) { +TEST_F(VeloxHashShuffleWriterMemoryTest, kUnevictableSingle) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.partitioning = Partitioning::kSingle; auto pool = SelfEvictedMemoryPool(defaultArrowMemoryPool().get()); @@ -657,7 +672,7 @@ TEST_F(VeloxShuffleWriterMemoryTest, kUnevictableSingle) { ASSERT_EQ(evicted, 0); } -TEST_F(VeloxShuffleWriterMemoryTest, resizeBinaryBufferTriggerSpill) { +TEST_F(VeloxHashShuffleWriterMemoryTest, resizeBinaryBufferTriggerSpill) { ASSERT_NOT_OK(initShuffleWriterOptions()); shuffleWriterOptions_.bufferReallocThreshold = 1; auto pool = SelfEvictedMemoryPool(defaultArrowMemoryPool().get(), false); diff --git a/cpp/velox/udf/Udaf.h b/cpp/velox/udf/Udaf.h index 2f292fbc6cb34..4555bdfdf8a34 100644 --- a/cpp/velox/udf/Udaf.h +++ b/cpp/velox/udf/Udaf.h @@ -28,6 +28,7 @@ struct UdafEntry { const char* intermediateType{nullptr}; bool variableArity{false}; + bool allowTypeConversion{false}; }; #define GLUTEN_GET_NUM_UDAF getNumUdaf diff --git a/cpp/velox/udf/Udf.h b/cpp/velox/udf/Udf.h index a32bdaefe9ec4..e0b3a70004e8c 100644 --- a/cpp/velox/udf/Udf.h +++ b/cpp/velox/udf/Udf.h @@ -27,6 +27,7 @@ struct UdfEntry { const char** argTypes; bool variableArity{false}; + bool allowTypeConversion{false}; }; #define GLUTEN_GET_NUM_UDF getNumUdf diff --git a/cpp/velox/udf/UdfLoader.cc b/cpp/velox/udf/UdfLoader.cc index 02aa410a95e10..8a99181662188 100644 --- a/cpp/velox/udf/UdfLoader.cc +++ b/cpp/velox/udf/UdfLoader.cc @@ -86,7 +86,8 @@ std::unordered_set> UdfLoader::getRegis const auto& entry = udfEntries[i]; auto dataType = toSubstraitTypeStr(entry.dataType); auto argTypes = toSubstraitTypeStr(entry.numArgs, entry.argTypes); - signatures_.insert(std::make_shared(entry.name, dataType, argTypes, entry.variableArity)); + signatures_.insert(std::make_shared( + entry.name, dataType, argTypes, entry.variableArity, entry.allowTypeConversion)); } free(udfEntries); } else { @@ -110,8 +111,8 @@ std::unordered_set> UdfLoader::getRegis auto dataType = toSubstraitTypeStr(entry.dataType); auto argTypes = toSubstraitTypeStr(entry.numArgs, entry.argTypes); auto intermediateType = toSubstraitTypeStr(entry.intermediateType); - signatures_.insert( - std::make_shared(entry.name, dataType, argTypes, intermediateType, entry.variableArity)); + signatures_.insert(std::make_shared( + entry.name, dataType, argTypes, intermediateType, entry.variableArity, entry.allowTypeConversion)); } free(udafEntries); } else { diff --git a/cpp/velox/udf/UdfLoader.h b/cpp/velox/udf/UdfLoader.h index 2783beb855119..51264e67cc4d7 100644 --- a/cpp/velox/udf/UdfLoader.h +++ b/cpp/velox/udf/UdfLoader.h @@ -37,21 +37,33 @@ class UdfLoader { std::string intermediateType{}; bool variableArity; + bool allowTypeConversion; - UdfSignature(std::string name, std::string returnType, std::string argTypes, bool variableArity) - : name(name), returnType(returnType), argTypes(argTypes), variableArity(variableArity) {} + UdfSignature( + std::string name, + std::string returnType, + std::string argTypes, + bool variableArity, + bool allowTypeConversion) + : name(name), + returnType(returnType), + argTypes(argTypes), + variableArity(variableArity), + allowTypeConversion(allowTypeConversion) {} UdfSignature( std::string name, std::string returnType, std::string argTypes, std::string intermediateType, - bool variableArity) + bool 
variableArity, + bool allowTypeConversion) : name(name), returnType(returnType), argTypes(argTypes), intermediateType(intermediateType), - variableArity(variableArity) {} + variableArity(variableArity), + allowTypeConversion(allowTypeConversion) {} ~UdfSignature() = default; }; diff --git a/cpp/velox/udf/examples/MyUDF.cc b/cpp/velox/udf/examples/MyUDF.cc index ee20ca39d0264..db1c5d7709f01 100644 --- a/cpp/velox/udf/examples/MyUDF.cc +++ b/cpp/velox/udf/examples/MyUDF.cc @@ -222,6 +222,30 @@ class MyDateRegisterer final : public gluten::UdfRegisterer { const std::string name_ = "mydate"; const char* myDateArg_[2] = {kDate, kInteger}; }; + +// name: mydate +// signatures: +// date, integer -> bigint +// type: SimpleFunction +// enable type conversion +class MyDate2Registerer final : public gluten::UdfRegisterer { + public: + int getNumUdf() override { + return 1; + } + + void populateUdfEntries(int& index, gluten::UdfEntry* udfEntries) override { + udfEntries[index++] = {name_.c_str(), kDate, 2, myDateArg_, false, true}; + } + + void registerSignatures() override { + facebook::velox::registerFunction({name_}); + } + + private: + const std::string name_ = "mydate2"; + const char* myDateArg_[2] = {kDate, kInteger}; +}; } // namespace mydate std::vector>& globalRegisters() { @@ -239,6 +263,7 @@ void setupRegisterers() { registerers.push_back(std::make_shared()); registerers.push_back(std::make_shared()); registerers.push_back(std::make_shared()); + registerers.push_back(std::make_shared()); inited = true; } } // namespace diff --git a/cpp/velox/utils/ConfigExtractor.cc b/cpp/velox/utils/ConfigExtractor.cc index 816166351c0e8..4189df758f390 100644 --- a/cpp/velox/utils/ConfigExtractor.cc +++ b/cpp/velox/utils/ConfigExtractor.cc @@ -59,7 +59,8 @@ std::string getConfigValue( return got->second; } -std::shared_ptr getHiveConfig(std::shared_ptr conf) { +std::shared_ptr getHiveConfig( + std::shared_ptr conf) { std::unordered_map hiveConfMap; #ifdef ENABLE_S3 @@ -125,7 +126,7 @@ std::shared_ptr getHiveConfig(std::shared_ptr< #ifdef ENABLE_GCS // https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcs/CONFIGURATION.md#api-client-configuration - auto gsStorageRootUrl = conf->get("spark.hadoop.fs.gs.storage.root.url"); + auto gsStorageRootUrl = conf->get("spark.hadoop.fs.gs.storage.root.url"); if (gsStorageRootUrl.hasValue()) { std::string url = gsStorageRootUrl.value(); std::string gcsScheme; @@ -146,23 +147,24 @@ std::shared_ptr getHiveConfig(std::shared_ptr< // https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcs/CONFIGURATION.md#http-transport-configuration // https://cloud.google.com/cpp/docs/reference/storage/latest/classgoogle_1_1cloud_1_1storage_1_1LimitedErrorCountRetryPolicy - auto gsMaxRetryCount = conf->get("spark.hadoop.fs.gs.http.max.retry"); + auto gsMaxRetryCount = conf->get("spark.hadoop.fs.gs.http.max.retry"); if (gsMaxRetryCount.hasValue()) { hiveConfMap[facebook::velox::connector::hive::HiveConfig::kGCSMaxRetryCount] = gsMaxRetryCount.value(); } // https://cloud.google.com/cpp/docs/reference/storage/latest/classgoogle_1_1cloud_1_1storage_1_1LimitedTimeRetryPolicy - auto gsMaxRetryTime = conf->get("spark.hadoop.fs.gs.http.max.retry-time"); + auto gsMaxRetryTime = conf->get("spark.hadoop.fs.gs.http.max.retry-time"); if (gsMaxRetryTime.hasValue()) { hiveConfMap[facebook::velox::connector::hive::HiveConfig::kGCSMaxRetryTime] = gsMaxRetryTime.value(); } // 
https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcs/CONFIGURATION.md#authentication - auto gsAuthType = conf->get("spark.hadoop.fs.gs.auth.type"); + auto gsAuthType = conf->get("spark.hadoop.fs.gs.auth.type"); if (gsAuthType.hasValue()) { std::string type = gsAuthType.value(); if (type == "SERVICE_ACCOUNT_JSON_KEYFILE") { - auto gsAuthServiceAccountJsonKeyfile = conf->get("spark.hadoop.fs.gs.auth.service.account.json.keyfile"); + auto gsAuthServiceAccountJsonKeyfile = + conf->get("spark.hadoop.fs.gs.auth.service.account.json.keyfile"); if (gsAuthServiceAccountJsonKeyfile.hasValue()) { auto stream = std::ifstream(gsAuthServiceAccountJsonKeyfile.value()); stream.exceptions(std::ios::badbit); @@ -180,7 +182,7 @@ std::shared_ptr getHiveConfig(std::shared_ptr< hiveConfMap[facebook::velox::connector::hive::HiveConfig::kEnableFileHandleCache] = conf->get(kVeloxFileHandleCacheEnabled, kVeloxFileHandleCacheEnabledDefault) ? "true" : "false"; - return std::make_shared(std::move(hiveConfMap)); + return std::make_shared(std::move(hiveConfMap)); } } // namespace gluten diff --git a/cpp/velox/utils/ConfigExtractor.h b/cpp/velox/utils/ConfigExtractor.h index c5f662c950de3..4cbfdf991f420 100644 --- a/cpp/velox/utils/ConfigExtractor.h +++ b/cpp/velox/utils/ConfigExtractor.h @@ -24,7 +24,7 @@ #include #include "config/GlutenConfig.h" -#include "velox/core/Config.h" +#include "velox/common/config/Config.h" namespace gluten { @@ -33,6 +33,7 @@ std::string getConfigValue( const std::string& key, const std::optional& fallbackValue); -std::shared_ptr getHiveConfig(std::shared_ptr conf); +std::shared_ptr getHiveConfig( + std::shared_ptr conf); } // namespace gluten diff --git a/cpp/velox/utils/HdfsUtils.cc b/cpp/velox/utils/HdfsUtils.cc index a912c04eee7e7..1bc326d4ca487 100644 --- a/cpp/velox/utils/HdfsUtils.cc +++ b/cpp/velox/utils/HdfsUtils.cc @@ -36,7 +36,7 @@ struct Credential { }; } // namespace -void updateHdfsTokens(const facebook::velox::Config* veloxCfg) { +void updateHdfsTokens(const facebook::velox::config::ConfigBase* veloxCfg) { static std::mutex mtx; std::lock_guard lock{mtx}; diff --git a/cpp/velox/utils/HdfsUtils.h b/cpp/velox/utils/HdfsUtils.h index cd017f250ad22..2e07d7ddf41bf 100644 --- a/cpp/velox/utils/HdfsUtils.h +++ b/cpp/velox/utils/HdfsUtils.h @@ -15,8 +15,8 @@ * limitations under the License. 
*/ -#include +#include #include namespace gluten { -void updateHdfsTokens(const facebook::velox::Config* veloxCfg); +void updateHdfsTokens(const facebook::velox::config::ConfigBase* veloxCfg); } diff --git a/cpp/velox/utils/VeloxBatchResizer.cc b/cpp/velox/utils/VeloxBatchResizer.cc index 7b51463068c94..56429299464ac 100644 --- a/cpp/velox/utils/VeloxBatchResizer.cc +++ b/cpp/velox/utils/VeloxBatchResizer.cc @@ -23,9 +23,7 @@ namespace { class SliceRowVector : public ColumnarBatchIterator { public: SliceRowVector(int32_t maxOutputBatchSize, facebook::velox::RowVectorPtr in) - : maxOutputBatchSize_(maxOutputBatchSize), in_(in) { - GLUTEN_CHECK(in->size() > maxOutputBatchSize, "Invalid state"); - } + : maxOutputBatchSize_(maxOutputBatchSize), in_(in) {} std::shared_ptr next() override { int32_t remainingLength = in_->size() - cursor_; @@ -55,7 +53,11 @@ gluten::VeloxBatchResizer::VeloxBatchResizer( : pool_(pool), minOutputBatchSize_(minOutputBatchSize), maxOutputBatchSize_(maxOutputBatchSize), - in_(std::move(in)) {} + in_(std::move(in)) { + GLUTEN_CHECK( + minOutputBatchSize_ > 0 && maxOutputBatchSize_ > 0, + "Either minOutputBatchSize or maxOutputBatchSize should be larger than 0"); +} std::shared_ptr VeloxBatchResizer::next() { if (next_) { @@ -82,6 +84,11 @@ std::shared_ptr VeloxBatchResizer::next() { for (auto nextCb = in_->next(); nextCb != nullptr; nextCb = in_->next()) { auto nextVb = VeloxColumnarBatch::from(pool_, nextCb); auto nextRv = nextVb->getRowVector(); + if (buffer->size() + nextRv->size() > maxOutputBatchSize_) { + GLUTEN_CHECK(next_ == nullptr, "Invalid state"); + next_ = std::make_unique(maxOutputBatchSize_, nextRv); + return std::make_shared(buffer); + } buffer->append(nextRv.get()); if (buffer->size() >= minOutputBatchSize_) { // Buffer is full. diff --git a/cpp/velox/utils/tests/VeloxShuffleWriterTestBase.h b/cpp/velox/utils/tests/VeloxShuffleWriterTestBase.h index d32e3272186b5..102c73ca49fab 100644 --- a/cpp/velox/utils/tests/VeloxShuffleWriterTestBase.h +++ b/cpp/velox/utils/tests/VeloxShuffleWriterTestBase.h @@ -196,9 +196,12 @@ class VeloxShuffleWriterTestBase : public facebook::velox::test::VectorTestBase inputVectorComplex_ = makeRowVector(childrenComplex_); } - arrow::Status splitRowVector(VeloxShuffleWriter& shuffleWriter, facebook::velox::RowVectorPtr vector) { + arrow::Status splitRowVector( + VeloxShuffleWriter& shuffleWriter, + facebook::velox::RowVectorPtr vector, + int64_t memLimit = ShuffleWriter::kMinMemLimit) { std::shared_ptr cb = std::make_shared(vector); - return shuffleWriter.write(cb, ShuffleWriter::kMinMemLimit); + return shuffleWriter.write(cb, memLimit); } // Create multiple local dirs and join with comma. @@ -533,7 +536,7 @@ class RoundRobinPartitioningShuffleWriter : public MultiplePartitioningShuffleWr } }; -class VeloxShuffleWriterMemoryTest : public VeloxShuffleWriterTestBase, public testing::Test { +class VeloxHashShuffleWriterMemoryTest : public VeloxShuffleWriterTestBase, public testing::Test { protected: static void SetUpTestCase() { facebook::velox::memory::MemoryManager::testingSetInstance({}); diff --git a/dev/build_arrow.sh b/dev/build_arrow.sh index b914a9fce48b3..e7496350f988c 100755 --- a/dev/build_arrow.sh +++ b/dev/build_arrow.sh @@ -14,6 +14,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+set -exu + CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) export SUDO=sudo source ${CURRENT_DIR}/build_helper_functions.sh @@ -30,13 +32,6 @@ function prepare_arrow_build() { popd } -function install_arrow_deps { - wget_and_untar https://github.com/openssl/openssl/archive/refs/tags/OpenSSL_1_1_1s.tar.gz openssl - pushd openssl - ./config no-shared && make depend && make && sudo make install - popd -} - function build_arrow_cpp() { pushd $ARROW_PREFIX/cpp diff --git a/dev/ci-velox-buildshared-centos-8.sh b/dev/ci-velox-buildshared-centos-8.sh index 362900bd009ac..b6b0cda02d289 100755 --- a/dev/ci-velox-buildshared-centos-8.sh +++ b/dev/ci-velox-buildshared-centos-8.sh @@ -2,17 +2,6 @@ set -e -sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true -sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true - -yum install sudo patch java-1.8.0-openjdk-devel wget -y -# Required by building arrow java. -wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz -tar -xvf apache-maven-3.8.8-bin.tar.gz && mv apache-maven-3.8.8 /usr/lib/maven -export PATH="${PATH}:/usr/lib/maven/bin" - source /opt/rh/gcc-toolset-9/enable ./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_tests=ON \ --build_examples=ON --build_benchmarks=ON --build_protobuf=ON - -cd ./cpp/build && ctest -V diff --git a/dev/ci-velox-buildstatic-centos-7.sh b/dev/ci-velox-buildstatic-centos-7.sh index 50f8ee06763e1..3272de95d910d 100755 --- a/dev/ci-velox-buildstatic-centos-7.sh +++ b/dev/ci-velox-buildstatic-centos-7.sh @@ -2,12 +2,7 @@ set -e -yum install sudo patch java-1.8.0-openjdk-devel -y -cd $GITHUB_WORKSPACE/ep/build-velox/src -./get_velox.sh source /opt/rh/devtoolset-9/enable -cd $GITHUB_WORKSPACE/ -source ./dev/vcpkg/env.sh -sed -i '/^headers/d' ep/build-velox/build/velox_ep/CMakeLists.txt export NUM_THREADS=4 -./dev/builddeps-veloxbe.sh --build_tests=OFF --build_benchmarks=OFF --enable_s3=ON --enable_gcs=ON --enable_hdfs=ON --enable_abfs=ON +./dev/builddeps-veloxbe.sh --enable_vcpkg=ON --build_arrow=OFF --build_tests=OFF --build_benchmarks=OFF \ + --build_examples=OFF --enable_s3=ON --enable_gcs=ON --enable_hdfs=ON --enable_abfs=ON diff --git a/dev/package.sh b/dev/package.sh index 6da750b44476d..a26f94d9b6b81 100755 --- a/dev/package.sh +++ b/dev/package.sh @@ -9,13 +9,6 @@ VERSION=$(. /etc/os-release && echo ${VERSION_ID}) ARCH=`uname -m` cd "$GLUTEN_DIR" -if [ "$LINUX_OS" == "centos" ]; then - if [ "$VERSION" == "8" ]; then - source /opt/rh/gcc-toolset-9/enable - elif [ "$VERSION" == "7" ]; then - source /opt/rh/devtoolset-9/enable - fi -fi # build gluten with velox backend, prompt always respond y export PROMPT_ALWAYS_RESPOND=y diff --git a/dev/vcpkg/Makefile b/dev/vcpkg/Makefile index 11cd02d8819be..874b0b5798962 100644 --- a/dev/vcpkg/Makefile +++ b/dev/vcpkg/Makefile @@ -1,4 +1,4 @@ -DOCKER_IMAGE=apache/gluten:gluten-vcpkg-builder_2024_05_22 +DOCKER_IMAGE=apache/gluten:gluten-vcpkg-builder GLUTEN_REPO=$(shell realpath -L ../..) CCACHE_DIR=$(HOME)/.ccache @@ -33,6 +33,8 @@ docker-image-gha: docker build \ --file docker/Dockerfile.gha \ --tag "$(DOCKER_IMAGE)" \ + --build-arg HTTPS_PROXY="" \ + --build-arg HTTP_PROXY="" \ . 
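For reference, a minimal sketch of how the `docker-image-gha` Makefile target changed above could be exercised locally; it assumes a Docker installation and a Gluten checkout, and simply mirrors the recipe and the `DOCKER_IMAGE` tag shown in the diff rather than introducing anything new.

```shell
# Sketch: build the GHA vcpkg builder image locally (assumes Docker and a Gluten checkout).
cd dev/vcpkg

# Via the Makefile target changed above; proxies are cleared through the new build args.
make docker-image-gha

# Roughly equivalent direct invocation, using the tag from the DOCKER_IMAGE variable.
docker build \
  --file docker/Dockerfile.gha \
  --tag apache/gluten:gluten-vcpkg-builder \
  --build-arg HTTPS_PROXY="" \
  --build-arg HTTP_PROXY="" \
  .
```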
diff --git a/dev/vcpkg/docker/Dockerfile.gha b/dev/vcpkg/docker/Dockerfile.gha index 6a297c4431cea..d37c449cb1ffa 100644 --- a/dev/vcpkg/docker/Dockerfile.gha +++ b/dev/vcpkg/docker/Dockerfile.gha @@ -1,14 +1,30 @@ FROM centos:7 -RUN yum install -y git patch wget sudo +RUN sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true +RUN sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true + +RUN yum install -y centos-release-scl +RUN rm /etc/yum.repos.d/CentOS-SCLo-scl.repo -f +RUN sed -i \ + -e 's/^mirrorlist/#mirrorlist/' \ + -e 's/^#baseurl/baseurl/' \ + -e 's/mirror\.centos\.org/vault.centos.org/' \ + /etc/yum.repos.d/CentOS-SCLo-scl-rh.repo + +RUN yum install -y git patch wget sudo java-1.8.0-openjdk-devel -# build RUN git clone --depth=1 https://github.com/apache/incubator-gluten /opt/gluten RUN echo "check_certificate = off" >> ~/.wgetrc -# deps + RUN cd /opt/gluten && bash ./dev/vcpkg/setup-build-depends.sh -# vcpkg env +# An actual path used for vcpkg cache. +RUN mkdir -p /var/cache/vcpkg + +# Set vcpkg cache path. ENV VCPKG_BINARY_SOURCES=clear;files,/var/cache/vcpkg,readwrite -RUN source /opt/rh/devtoolset-9/enable && cd /opt/gluten && source dev/vcpkg/env.sh + +# Build arrow, then install the native libs to system paths and jar package to .m2/ directory. +RUN cd /opt/gluten && source /opt/rh/devtoolset-9/enable && source ./dev/vcpkg/env.sh && \ + bash ./dev/builddeps-veloxbe.sh build_arrow && rm -rf /opt/gluten diff --git a/dev/vcpkg/env.sh b/dev/vcpkg/env.sh index cf3126f0a1b9e..8b247a907a059 100755 --- a/dev/vcpkg/env.sh +++ b/dev/vcpkg/env.sh @@ -1,4 +1,5 @@ #! /bin/bash +set -e if [ -z "${BASH_SOURCE[0]}" ] || [ "$0" == "${BASH_SOURCE[0]}" ]; then echo "env.sh should only be sourced in bash" >&2 diff --git a/dev/vcpkg/ports/simdjson/vcpkg.json b/dev/vcpkg/ports/simdjson/vcpkg.json index 6e46382e42f8f..2b74be554cfc2 100644 --- a/dev/vcpkg/ports/simdjson/vcpkg.json +++ b/dev/vcpkg/ports/simdjson/vcpkg.json @@ -17,8 +17,7 @@ "default-features": [ "deprecated", "exceptions", - "threads", - "utf8-validation" + "threads" ], "features": { "deprecated": { diff --git a/dev/vcpkg/setup-build-depends.sh b/dev/vcpkg/setup-build-depends.sh index da104b6df8e49..ca272a8318d54 100755 --- a/dev/vcpkg/setup-build-depends.sh +++ b/dev/vcpkg/setup-build-depends.sh @@ -184,7 +184,7 @@ install_ubuntu_18.04() { } install_ubuntu_20.04() { - apt-get -y install \ + apt-get update && apt-get -y install \ wget curl tar zip unzip git \ build-essential ccache cmake ninja-build pkg-config autoconf autoconf-archive libtool \ flex bison \ diff --git a/docs/developers/MicroBenchmarks.md b/docs/developers/MicroBenchmarks.md index 21f222b42690d..bd469f34c81cc 100644 --- a/docs/developers/MicroBenchmarks.md +++ b/docs/developers/MicroBenchmarks.md @@ -320,23 +320,44 @@ cd /path/to/gluten/cpp/build/velox/benchmarks --threads 1 ``` -### Run shuffle write task only +### Run shuffle write/read task only Developers can only run shuffle write task via specifying `--run-shuffle` and `--data` options. The parquet format input will be read from arrow-parquet reader and sent to shuffle writer. -This option is similar to the `--with-shuffle` option, but it doesn't require the plan and split files. +The `--run-shuffle` option is similar to the `--with-shuffle` option, but it doesn't require the plan and split files. The round-robin partitioner is used by default. Besides, random partitioning can be used for testing purpose. 
By specifying option `--partitioning random`, the partitioner will generate a random partition id for each row. +To evaluate the shuffle reader performance, developers can add the `--run-shuffle-read` option to append a read stage after the write task finishes. + +The command below runs shuffle write/read in a single thread, using the sort shuffle writer with 40000 partitions and random partition ids. ```shell cd /path/to/gluten/cpp/build/velox/benchmarks ./generic_benchmark \ --run-shuffle \ +--run-shuffle-read \ --data /path/to/input_for_shuffle_write.parquet --shuffle-writer sort \ +--partitioning random \ +--shuffle-partitions 40000 \ --threads 1 ``` + +The output should look like: + +``` +------------------------------------------------------------------------------------------------------------------------- +Benchmark Time CPU Iterations UserCounters... +------------------------------------------------------------------------------------------------------------------------- +ShuffleWriteRead/iterations:1/process_time/real_time/threads:1 121637629714 ns 121309450910 ns 1 elapsed_time=121.638G read_input_time=25.2637G shuffle_compress_time=10.8311G shuffle_decompress_time=4.04055G shuffle_deserialize_time=7.24289G shuffle_spill_time=0 shuffle_split_time=69.9098G shuffle_write_time=2.03274G +``` + +## Enable debug mode + +`spark.gluten.sql.debug` (debug mode) is set to false by default, so the google glog levels are limited to print only `WARNING` or higher severity logs. +Unless `spark.gluten.sql.debug` is set in the INI file via `--conf`, the logging behavior is the same as with debug mode off. +Developers can use the `--debug-mode` command line flag to turn on debug mode when needed, and set the verbosity/severity level via the command line flags `--v` and `--minloglevel`. Note that constructing and deconstructing log strings can be very time-consuming, which may cause benchmark times to be inaccurate. + ## Simulate write tasks The last operator for a write task is a file write operator, and the output from Velox pipeline only diff --git a/docs/developers/UsingGperftoolsInCH.md b/docs/developers/UsingGperftoolsInCH.md index f0d5c720b30b5..5a4bbea3fbbc5 100644 --- a/docs/developers/UsingGperftoolsInCH.md +++ b/docs/developers/UsingGperftoolsInCH.md @@ -1,3 +1,10 @@ +--- +layout: page +title: Debug for CH Backend with gperftools Tool +nav_order: 11 +has_children: true +parent: /developer-overview/ +--- We need using gpertools to find the memory or CPU issue. That's what this document is about. ## Install gperftools diff --git a/docs/developers/UsingJemallocWithCH.md b/docs/developers/UsingJemallocWithCH.md index 626f7522d7c8c..365a35dd39fee 100644 --- a/docs/developers/UsingJemallocWithCH.md +++ b/docs/developers/UsingJemallocWithCH.md @@ -1,3 +1,10 @@ +--- +layout: page +title: Use Jemalloc for CH Backend +nav_order: 12 +has_children: true +parent: /developer-overview/ +--- We need using jemalloc to find the memory issue. That's what this document is about.
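As a usage illustration for the "Enable debug mode" paragraph added to MicroBenchmarks.md above: a hedged sketch combining the flags it names; the paths are placeholders and the glog-style `--v`/`--minloglevel` values are example choices, not prescribed defaults.

```shell
# Sketch: run the shuffle benchmark with debug mode on and verbose glog output.
cd /path/to/gluten/cpp/build/velox/benchmarks
./generic_benchmark \
--run-shuffle \
--data /path/to/input_for_shuffle_write.parquet \
--shuffle-writer sort \
--debug-mode \
--v=1 \
--minloglevel=0 \
--threads 1
```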
## Change code of jemalloc diff --git a/docs/developers/VeloxUDF.md b/docs/developers/VeloxUDF.md index c896fd6726573..6872f2d0c8419 100644 --- a/docs/developers/VeloxUDF.md +++ b/docs/developers/VeloxUDF.md @@ -1,3 +1,10 @@ +--- +layout: page +title: Velox UDF and UDAF +nav_order: 13 +has_children: true +parent: /developer-overview/ +--- # Velox User-Defined Functions (UDF) and User-Defined Aggregate Functions (UDAF) ## Introduction @@ -21,18 +28,18 @@ The following steps demonstrate how to set up a UDF library project: - **Implement the Interface Functions:** Implement the following interface functions that integrate UDF into Project Gluten: - - `getNumUdf()`: - This function should return the number of UDF in the library. - This is used to allocating udfEntries array as the argument for the next function `getUdfEntries`. + - `getNumUdf()`: + This function should return the number of UDF in the library. + This is used to allocating udfEntries array as the argument for the next function `getUdfEntries`. - - `getUdfEntries(gluten::UdfEntry* udfEntries)`: - This function should populate the provided udfEntries array with the details of the UDF, including function names and signatures. + - `getUdfEntries(gluten::UdfEntry* udfEntries)`: + This function should populate the provided udfEntries array with the details of the UDF, including function names and signatures. - - `registerUdf()`: - This function is called to register the UDF to Velox function registry. - This is where users should register functions by calling `facebook::velox::exec::registerVecotorFunction` or other Velox APIs. + - `registerUdf()`: + This function is called to register the UDF to Velox function registry. + This is where users should register functions by calling `facebook::velox::exec::registerVecotorFunction` or other Velox APIs. - - The interface functions are mapped to marcos in [Udf.h](../../cpp/velox/udf/Udf.h). Here's an example of how to implement these functions: + - The interface functions are mapped to marcos in [Udf.h](../../cpp/velox/udf/Udf.h). Here's an example of how to implement these functions: ``` // Filename MyUDF.cc @@ -176,6 +183,14 @@ The output from spark-shell will be like +------------------+----------------+ ``` +## Configurations + +| Parameters | Description | +|----------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------| +| spark.gluten.sql.columnar.backend.velox.udfLibraryPaths | Path to the udf/udaf libraries. | +| spark.gluten.sql.columnar.backend.velox.driver.udfLibraryPaths | Path to the udf/udaf libraries on driver node. Only applicable on yarn-client mode. | +| spark.gluten.sql.columnar.backend.velox.udfAllowTypeConversion | Whether to inject possible `cast` to convert mismatched data types from input to one registered signatures. | + # Pandas UDFs (a.k.a. Vectorized UDFs) ## Introduction diff --git a/docs/developers/velox-backend-CI.md b/docs/developers/velox-backend-CI.md new file mode 100644 index 0000000000000..e4dceffdf72e5 --- /dev/null +++ b/docs/developers/velox-backend-CI.md @@ -0,0 +1,24 @@ +--- +layout: page +title: Velox Backend CI +nav_order: 14 +parent: Developer Overview +--- +# Velox Backend CI + +GHA workflows are defined under `.github/workflows/`. + +## Docker Build +We have a weekly job to build a docker based on `Dockerfile.gha` for CI verification, defined in `docker_image.yml`. 
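To show how the UDF configuration table added to VeloxUDF.md above would typically be used, a minimal sketch of passing those properties to `spark-shell`; the library path is a placeholder, and only the property names come from the table.

```shell
# Sketch: load a UDF library and allow type conversion for mismatched signatures.
# /path/to/libmyudf.so is a placeholder for an actual UDF/UDAF library.
spark-shell \
  --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=/path/to/libmyudf.so \
  --conf spark.gluten.sql.columnar.backend.velox.udfAllowTypeConversion=true
```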
+ +## Vcpkg Caching +The Gluten main branch is pulled down during the docker build, and vcpkg caches binary data for all dependencies defined under dev/vcpkg. +This binary data is cached into `/var/cache/vcpkg` so that CI jobs can re-use it in new builds. By setting `VCPKG_BINARY_SOURCES=clear` in the environment, +re-use of the vcpkg cache can be disabled. + +## Arrow Libs Pre-installation +Arrow libs are pre-installed in the docker image, assuming they are not actively changed and therefore do not need to be re-built every time. + +## Updating Docker Image +Two GitHub secrets `DOCKERHUB_USER` & `DOCKERHUB_TOKEN` can be used to push the docker image to docker hub: https://hub.docker.com/r/apache/gluten/tags. +Note that GitHub secrets are not retrievable in PRs from forked repos. \ No newline at end of file diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md index 63239c39931ef..776736b2fff5b 100644 --- a/docs/get-started/Velox.md +++ b/docs/get-started/Velox.md @@ -230,7 +230,7 @@ mvn clean package -Pbackends-velox -Pspark-3.3 -Pceleborn -DskipTests Then add the Gluten and Spark Celeborn Client packages to your Spark application's classpath(usually add them into `$SPARK_HOME/jars`). - Celeborn: celeborn-client-spark-3-shaded_2.12-[celebornVersion].jar -- Gluten: gluten-velox-bundle-spark3.x_2.12-xx_xx_xx-SNAPSHOT.jar, gluten-celeborn-package-xx-SNAPSHOT.jar +- Gluten: gluten-velox-bundle-spark3.x_2.12-xx_xx_xx-SNAPSHOT.jar (The bundled Gluten Jar. Make sure -Pceleborn is specified when it is built.) Currently to use Gluten following configurations are required in `spark-defaults.conf` @@ -279,7 +279,7 @@ mvn clean package -Pbackends-velox -Pspark-3.3 -Puniffle -DskipTests Then add the Uniffle and Spark Celeborn Client packages to your Spark application's classpath(usually add them into `$SPARK_HOME/jars`). - Uniffle: rss-client-spark3-shaded-[uniffleVersion].jar -- Gluten: gluten-uniffle-velox-xxx-SNAPSHOT-3.x.jar +- Gluten: gluten-velox-bundle-spark3.x_2.12-xx_xx_xx-SNAPSHOT.jar (The bundled Gluten Jar. Make sure -Puniffle is specified when it is built.) Currently to use Gluten following configurations are required in `spark-defaults.conf` @@ -298,7 +298,7 @@ spark.shuffle.service.enabled false spark.rss.storage.type LOCALFILE_HDFS # If you want to use dynamic resource allocation, -# please refer to this URL (https://github.com/apache/incubator-uniffle/tree/master/patch/spark) to apply the patch into your own Spark. +# please refer to this URL (https://uniffle.apache.org/docs/client-guide#support-spark-dynamic-allocation) for more details. spark.dynamicAllocation.enabled false ``` @@ -314,10 +314,7 @@ First of all, compile gluten-delta module by a `delta` profile, as follows: mvn clean package -Pbackends-velox -Pspark-3.3 -Pdelta -DskipTests ``` -Then, put the additional `gluten-delta-XX-SNAPSHOT.jar` to the class path (usually it's `$SPARK_HOME/jars`). -The gluten-delta jar is in `gluten-delta/target` directory. - -After the two steps, you can query delta table by gluten/velox without scan's fallback. +Once built successfully, delta features will be included in gluten-velox-bundle-X jar. Then you can query delta table by gluten/velox without scan's fallback. Gluten with velox backends also support the column mapping of delta tables. About column mapping, see more [here](https://docs.delta.io/latest/delta-column-mapping.html). @@ -336,8 +333,6 @@ mvn clean package -Pbackends-velox -Pspark-3.3 -Piceberg -DskipTests Once built successfully, iceberg features will be included in gluten-velox-bundle-X jar.
Then you can query iceberg table by gluten/velox without scan's fallback. -After the two steps, you can query iceberg table by gluten/velox without scan's fallback. - # Coverage Spark3.3 has 387 functions in total. ~240 are commonly used. To get the support status of all Spark built-in functions, please refer to [Velox Backend's Supported Operators & Functions](../velox-backend-support-progress.md). diff --git a/docs/velox-backend-support-progress.md b/docs/velox-backend-support-progress.md index 0f79c83e59f29..aa2e9c999dd47 100644 --- a/docs/velox-backend-support-progress.md +++ b/docs/velox-backend-support-progress.md @@ -92,6 +92,34 @@ Gluten supports 28 operators (Drag to right to see all data types) Gluten supports 199 functions. (Drag to right to see all data types) +#### Cast function's support status + + * S: supported. + * NS: not supported. + * -: not accepted by Spark. + * N/A: not applicable case, e.g., from type is as same as to type, where cast will not actually happen. + +| From \ To | BOOLEAN | BYTE | SHORT | INT | LONG | FLOAT | DOUBLE | DECIMAL | DATE | TIMESTAMP | STRING | BINARY | ARRAY | MAP | STRUCT | NULL | +|-----------|---------|------|-------|-----|------|-------|--------|---------|------|-----------|--------|--------|-------|-----|--------|------| +| BOOLEAN | N/A | S | S | S | S | S | S | S | - | NS | S | - | - | - | - | - | +| BYTE | S | N/A | S | S | S | S | S | S | - | NS | S | S | - | - | - | - | +| SHORT | S | S | N/A | S | S | S | S | S | - | NS | S | S | - | - | - | - | +| INT | S | S | S | N/A | S | S | S | S | - | NS | S | S | - | - | - | - | +| LONG | S | S | S | S | N/A | S | S | S | - | NS | S | S | - | - | - | - | +| FLOAT | S | S | S | S | S | N/A | S | S | - | NS | S | - | - | - | - | - | +| DOUBLE | S | S | S | S | S | S | N/A | S | - | NS | S | - | - | - | - | - | +| DECIMAL | S | S | S | S | S | S | S | N/A | - | NS | S | - | - | - | - | - | +| DATE | NS | NS | NS | NS | NS | NS | NS | NS | N/A | NS | NS | - | - | - | - | - | +| TIMESTAMP | NS | NS | NS | NS | NS | NS | NS | NS | NS | N/A | NS | - | - | - | - | - | +| STRING | S | S | S | S | S | S | S | S | NS | NS | N/A | - | - | - | - | - | +| BINARY | S | S | S | S | S | S | S | S | NS | NS | S | N/A | - | - | - | - | +| ARRAY | - | - | - | - | - | - | - | - | - | - | NS | - | N/A | - | - | - | +| Map | - | - | - | - | - | - | - | - | - | - | NS | - | - | N/A | - | - | +| STRUCT | - | - | - | - | - | - | - | - | - | - | NS | - | - | - | N/A | - | +| NULL | S | S | S | S | S | S | S | S | S | NS | S | S | S | S | S | N/A | + +#### Other functions' support status + | Spark Functions | Velox/Presto Functions | Velox/Spark functions | Gluten | Restrictions | BOOLEAN | BYTE | SHORT | INT | LONG | FLOAT | DOUBLE | DATE | TIMESTAMP | STRING | DECIMAL | NULL | BINARY | CALENDAR | ARRAY | MAP | STRUCT | UDT | |------------------------------|------------------------|-----------------------|--------|--------------------------|---------|------|-------|-----|------|-------|--------|------|-----------|--------|---------|------|--------|----------|-------|-----|--------|-----| | ! | | not | S | | S | S | S | S | S | S | S | | | S | | | | | | | | | @@ -180,7 +208,7 @@ Gluten supports 199 functions. 
(Drag to right to see all data types) | sentences | | | | | | | | | | | | | | | | | | | | | | | | soundex | | soundex | S | | | | | | | | | | | | | | | | | | | | | space | | | | | | | | | | | | | | | | | | | | | | | -| split | split | split | S | Mismatched | | | | | | | | | | | | | | | | | | | +| split | split | split | S | | | | | | | | | | | | | | | | | | | | | split_part | split_part | | | Mismatched | | | | | | | | | | | | | | | | | | | | startswith | | startsWith | | | | | | | | | | | | | | | | | | | | | | substr, substring | substr | substring | S | | | | | | | | | | | S | | | | | | | | | @@ -292,6 +320,7 @@ Gluten supports 199 functions. (Drag to right to see all data types) | map_keys | map_keys | map_keys | S | | | | | | | | | | | | | | | | | | | | | map_values | map_values | map_values | S | | | | | | | | | | | | | | | | | S | | | | map_zip_with | map_zip_with | | S | | | | | | | | | | | | | | | | | S | | | +| mask | | mask | S | | | | | | | | | | | | | | | | | | | | | named_struct,struct | row_construct | named_struct | S | | | | | | | | | | | | | | | | | | S | | | posexplode_outer,posexplode | | | | | | | | | | | | | | | | | | | | | | | | sequence | | | | | | | | | | | | | | | | | | | | | | | diff --git a/ep/build-velox/src/build_velox.sh b/ep/build-velox/src/build_velox.sh index 873d3638cf14b..b48f28a374b2f 100755 --- a/ep/build-velox/src/build_velox.sh +++ b/ep/build-velox/src/build_velox.sh @@ -97,7 +97,7 @@ function compile { set -exu CXX_FLAGS='-Wno-missing-field-initializers' - COMPILE_OPTION="-DCMAKE_CXX_FLAGS=\"$CXX_FLAGS\" -DVELOX_ENABLE_PARQUET=ON -DVELOX_BUILD_TESTING=OFF" + COMPILE_OPTION="-DCMAKE_CXX_FLAGS=\"$CXX_FLAGS\" -DVELOX_ENABLE_PARQUET=ON -DVELOX_BUILD_TESTING=OFF -DVELOX_MONO_LIBRARY=ON" if [ $BUILD_TEST_UTILS == "ON" ]; then COMPILE_OPTION="$COMPILE_OPTION -DVELOX_BUILD_TEST_UTILS=ON" fi diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh index 90f5e895df206..0d7ee5f3539c5 100755 --- a/ep/build-velox/src/get_velox.sh +++ b/ep/build-velox/src/get_velox.sh @@ -17,7 +17,7 @@ set -exu VELOX_REPO=https://github.com/oap-project/velox.git -VELOX_BRANCH=2024_08_06 +VELOX_BRANCH=2024_08_22 VELOX_HOME="" OS=`uname -s` @@ -88,7 +88,7 @@ function process_setup_ubuntu { sed -i '/ccache /a\ yasm \\' scripts/setup-ubuntu.sh ensure_pattern_matched 'run_and_time install_conda' scripts/setup-ubuntu.sh sed -i '/run_and_time install_conda/d' scripts/setup-ubuntu.sh - # Just depends on Gluten to install arrow libs since Gluten will apply some patches to Arrow source and uses different build options. + # Just depends on Gluten to install arrow libs since Gluten requires some patches are applied and some different build options are used. ensure_pattern_matched 'run_and_time install_arrow' scripts/setup-ubuntu.sh sed -i '/run_and_time install_arrow/d' scripts/setup-ubuntu.sh } @@ -116,6 +116,10 @@ function process_setup_centos9 { # Required by lib hdfs. ensure_pattern_matched 'dnf_install ninja-build' scripts/setup-centos9.sh sed -i '/^ dnf_install ninja-build/a\ dnf_install yasm\' scripts/setup-centos9.sh + + # Just depends on Gluten to install arrow libs since Gluten requires some patches are applied and some different build options are used. 
+ ensure_pattern_matched 'run_and_time install_arrow' scripts/setup-centos9.sh + sed -i '/run_and_time install_arrow/d' scripts/setup-centos9.sh } function process_setup_alinux3 { diff --git a/ep/build-velox/src/modify_velox.patch b/ep/build-velox/src/modify_velox.patch index 533b493539e4c..a218fa70440e5 100644 --- a/ep/build-velox/src/modify_velox.patch +++ b/ep/build-velox/src/modify_velox.patch @@ -1,40 +1,3 @@ -diff --git a/CMake/Findlz4.cmake b/CMake/Findlz4.cmake -index d49115f12..1aaa8e532 100644 ---- a/CMake/Findlz4.cmake -+++ b/CMake/Findlz4.cmake -@@ -21,18 +21,19 @@ find_package_handle_standard_args(lz4 DEFAULT_MSG LZ4_LIBRARY LZ4_INCLUDE_DIR) - - mark_as_advanced(LZ4_LIBRARY LZ4_INCLUDE_DIR) - --get_filename_component(liblz4_ext ${LZ4_LIBRARY} EXT) --if(liblz4_ext STREQUAL ".a") -- set(liblz4_type STATIC) --else() -- set(liblz4_type SHARED) --endif() -- - if(NOT TARGET lz4::lz4) -- add_library(lz4::lz4 ${liblz4_type} IMPORTED) -- set_target_properties(lz4::lz4 PROPERTIES INTERFACE_INCLUDE_DIRECTORIES -- "${LZ4_INCLUDE_DIR}") -- set_target_properties( -- lz4::lz4 PROPERTIES IMPORTED_LINK_INTERFACE_LANGUAGES "C" -- IMPORTED_LOCATION "${LZ4_LIBRARIES}") -+ add_library(lz4::lz4 UNKNOWN IMPORTED) -+ set_target_properties(lz4::lz4 PROPERTIES -+ INTERFACE_INCLUDE_DIRECTORIES "${LZ4_INCLUDE_DIR}" -+ IMPORTED_LINK_INTERFACE_LANGUAGES "C" -+ IMPORTED_LOCATION_RELEASE "${LZ4_LIBRARY_RELEASE}") -+ set_property(TARGET lz4::lz4 APPEND PROPERTY -+ IMPORTED_CONFIGURATIONS RELEASE) -+ -+ if(LZ4_LIBRARY_DEBUG) -+ set_property(TARGET lz4::lz4 APPEND PROPERTY -+ IMPORTED_CONFIGURATIONS DEBUG) -+ set_property(TARGET lz4::lz4 PROPERTY -+ IMPORTED_LOCATION_DEBUG "${LZ4_LIBRARY_DEBUG}") -+ endif() - endif() diff --git a/CMake/resolve_dependency_modules/arrow/CMakeLists.txt b/CMake/resolve_dependency_modules/arrow/CMakeLists.txt index 56b673e87..ef48ae9d9 100644 --- a/CMake/resolve_dependency_modules/arrow/CMakeLists.txt @@ -95,8 +58,19 @@ index 56b673e87..ef48ae9d9 100644 URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM} SOURCE_SUBDIR cpp CMAKE_ARGS ${ARROW_CMAKE_ARGS} +diff --git a/CMake/resolve_dependency_modules/simdjson.cmake b/CMake/resolve_dependency_modules/simdjson.cmake +index 69e7f2044..777eb5ec1 100644 +--- a/CMake/resolve_dependency_modules/simdjson.cmake ++++ b/CMake/resolve_dependency_modules/simdjson.cmake +@@ -29,4 +29,6 @@ FetchContent_Declare( + URL ${VELOX_SIMDJSON_SOURCE_URL} + URL_HASH ${VELOX_SIMDJSON_BUILD_SHA256_CHECKSUM}) + ++set(SIMDJSON_SKIPUTF8VALIDATION ON) ++ + FetchContent_MakeAvailable(simdjson) diff --git a/CMakeLists.txt b/CMakeLists.txt -index 2dc95f972..391485879 100644 +index cc180b86c..7ca6fa727 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,10 +236,15 @@ if(VELOX_ENABLE_ABFS) @@ -129,19 +103,19 @@ index 2dc95f972..391485879 100644 # This is a bit convoluted, but we want to be able to use gflags::gflags as a # target even when velox is built as a subproject which uses diff --git a/velox/common/process/tests/CMakeLists.txt b/velox/common/process/tests/CMakeLists.txt -index f5e6aae72..4c4afe70b 100644 +index 23ef279c2..7e4c2f2b2 100644 --- a/velox/common/process/tests/CMakeLists.txt +++ b/velox/common/process/tests/CMakeLists.txt @@ -24,4 +24,6 @@ target_link_libraries( fmt::fmt - gtest velox_time -- gtest_main) -+ gtest_main + GTest::gtest +- GTest::gtest_main) ++ GTest::gtest_main + glog::glog + gflags::gflags) diff --git a/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.cpp 
b/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.cpp -index e2a638df6..e383cf205 100644 +index 5a566770f..8c2a48cc2 100644 --- a/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.cpp +++ b/velox/connectors/hive/storage_adapters/abfs/RegisterAbfsFileSystem.cpp @@ -38,7 +38,6 @@ std::shared_ptr abfsFileSystemGenerator( @@ -166,12 +140,12 @@ index 10ee508ba..027a58ecc 100644 } diff --git a/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt b/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt -index 97266c253..11d88dcc4 100644 +index 0cda25430..b15565796 100644 --- a/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt +++ b/velox/dwio/parquet/writer/arrow/tests/CMakeLists.txt @@ -38,7 +38,9 @@ target_link_libraries( - gtest - gtest_main + GTest::gtest + GTest::gtest_main arrow - arrow_testing) + arrow_testing diff --git a/ep/build-velox/src/setup-centos7.sh b/ep/build-velox/src/setup-centos7.sh index 34d7bcfb65c5d..708c692ea5827 100755 --- a/ep/build-velox/src/setup-centos7.sh +++ b/ep/build-velox/src/setup-centos7.sh @@ -17,7 +17,7 @@ set -efx -o pipefail # Some of the packages must be build with the same compiler flags # so that some low level types are the same size. Also, disable warnings. -SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}")/../build/velox_ep/scripts +SCRIPTDIR=./scripts source $SCRIPTDIR/setup-helper-functions.sh DEPENDENCY_DIR=${DEPENDENCY_DIR:-/tmp/velox-deps} CPU_TARGET="${CPU_TARGET:-avx}" diff --git a/ep/build-velox/src/setup-centos8.sh b/ep/build-velox/src/setup-centos8.sh index 771c2ab835dfe..4094256968e37 100755 --- a/ep/build-velox/src/setup-centos8.sh +++ b/ep/build-velox/src/setup-centos8.sh @@ -29,7 +29,7 @@ set -efx -o pipefail # Some of the packages must be build with the same compiler flags # so that some low level types are the same size. Also, disable warnings. 
-SCRIPTDIR=$(dirname "${BASH_SOURCE[0]}")/../build/velox_ep/scripts +SCRIPTDIR=./scripts source $SCRIPTDIR/setup-helper-functions.sh CPU_TARGET="${CPU_TARGET:-avx}" NPROC=$(getconf _NPROCESSORS_ONLN) diff --git a/gluten-celeborn/clickhouse/pom.xml b/gluten-celeborn/clickhouse/pom.xml index 284a8f57282a5..9e64e77ce6ead 100755 --- a/gluten-celeborn/clickhouse/pom.xml +++ b/gluten-celeborn/clickhouse/pom.xml @@ -148,6 +148,38 @@ ${hadoop.version} test + + org.apache.arrow + arrow-memory-core + ${arrow.version} + provided + + + io.netty + netty-common + + + io.netty + netty-buffer + + + + + org.apache.arrow + arrow-vector + ${arrow.version} + provided + + + io.netty + netty-common + + + io.netty + netty-buffer + + + diff --git a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarBatchSerializer.scala b/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarBatchSerializer.scala index 3619855f74ed5..3b8e92bfe1d24 100644 --- a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarBatchSerializer.scala +++ b/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarBatchSerializer.scala @@ -58,8 +58,14 @@ private class CHCelebornColumnarBatchSerializerInstance( extends SerializerInstance with Logging { - private lazy val compressionCodec = - GlutenShuffleUtils.getCompressionCodec(SparkEnv.get.conf).toUpperCase(Locale.ROOT) + private lazy val conf = SparkEnv.get.conf + private lazy val compressionCodec = GlutenShuffleUtils.getCompressionCodec(conf) + private lazy val capitalizedCompressionCodec = compressionCodec.toUpperCase(Locale.ROOT) + private lazy val compressionLevel = + GlutenShuffleUtils.getCompressionLevel( + conf, + compressionCodec, + GlutenConfig.getConf.columnarShuffleCodecBackend.orNull) override def deserializeStream(in: InputStream): DeserializationStream = { new DeserializationStream { @@ -199,7 +205,8 @@ private class CHCelebornColumnarBatchSerializerInstance( writeBuffer, dataSize, CHBackendSettings.useCustomizedShuffleCodec, - compressionCodec, + capitalizedCompressionCodec, + compressionLevel, CHBackendSettings.customizeBufferSize ) diff --git a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriter.scala b/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriter.scala index c7d7957c15b6b..9b99e533f9352 100644 --- a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriter.scala +++ b/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriter.scala @@ -47,7 +47,7 @@ class CHCelebornColumnarShuffleWriter[K, V]( client, writeMetrics) { - private val customizedCompressCodec = + private val capitalizedCompressionCodec = customizedCompressionCodec.toUpperCase(Locale.ROOT) private val jniWrapper = new CHShuffleSplitterJniWrapper @@ -105,7 +105,8 @@ class CHCelebornColumnarShuffleWriter[K, V]( shuffleId, mapId, nativeBufferSize, - customizedCompressCodec, + capitalizedCompressionCodec, + compressionLevel, GlutenConfig.getConf.chColumnarShuffleSpillThreshold, CHBackendSettings.shuffleHashAlgorithm, celebornPartitionPusher, diff --git a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java index a0516d1774150..5e14531d6f19a 100644 --- 
a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java +++ b/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java @@ -291,12 +291,18 @@ public ShuffleWriter getWriter( } @SuppressWarnings("unchecked") CelebornShuffleHandle h = ((CelebornShuffleHandle) handle); + + CelebornConf writerConf = celebornConf; + if (!(h.dependency() instanceof ColumnarShuffleDependency)) { + writerConf = rowBasedCelebornConf; + } + shuffleClient = CelebornUtils.getShuffleClient( h.appUniqueId(), h.lifecycleManagerHost(), h.lifecycleManagerPort(), - celebornConf, + writerConf, h.userIdentifier(), false, extension); diff --git a/gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala b/gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala index f71fadd4cd64e..3f7c3586ced28 100644 --- a/gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala +++ b/gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala @@ -94,7 +94,10 @@ abstract class CelebornColumnarShuffleWriter[K, V]( } protected val compressionLevel: Int = - GlutenShuffleUtils.getCompressionLevel(conf, customizedCompressionCodec, null) + GlutenShuffleUtils.getCompressionLevel( + conf, + customizedCompressionCodec, + GlutenConfig.getConf.columnarShuffleCodecBackend.orNull) protected val bufferCompressThreshold: Int = GlutenConfig.getConf.columnarShuffleCompressionThreshold diff --git a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala b/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala index 6f21b528f1c2d..696a3c3438b0f 100644 --- a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala +++ b/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala @@ -18,8 +18,8 @@ package org.apache.spark.shuffle import org.apache.gluten.GlutenConfig import org.apache.gluten.GlutenConfig.{GLUTEN_RSS_SORT_SHUFFLE_WRITER, GLUTEN_SORT_SHUFFLE_WRITER} -import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.utils.ArrowAbiUtil import org.apache.gluten.vectorized._ diff --git a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala b/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala index c14d46a52c125..b7a0beae704be 100644 --- a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala +++ b/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala @@ -18,8 +18,8 @@ package org.apache.spark.shuffle import org.apache.gluten.GlutenConfig import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller, Spillers} +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.vectorized._ import org.apache.spark._ diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/SimpleMemoryUsageRecorder.java b/gluten-core/src/main/java/org/apache/gluten/memory/SimpleMemoryUsageRecorder.java index 
16b260469f5f6..fb8b0d1e2b615 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/SimpleMemoryUsageRecorder.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/SimpleMemoryUsageRecorder.java @@ -30,13 +30,13 @@ public class SimpleMemoryUsageRecorder implements MemoryUsageRecorder { @Override public void inc(long bytes) { final long total = this.current.addAndGet(bytes); - long prev_peak; + long prevPeak; do { - prev_peak = this.peak.get(); - if (total <= prev_peak) { + prevPeak = this.peak.get(); + if (total <= prevPeak) { break; } - } while (!this.peak.compareAndSet(prev_peak, total)); + } while (!this.peak.compareAndSet(prevPeak, total)); } // peak used bytes diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java index 75e3db2e7d1f8..bb1e7102b1c3c 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java @@ -50,7 +50,7 @@ public static MemoryTarget dynamicOffHeapSizingIfEnabled(MemoryTarget memoryTarg return memoryTarget; } - public static MemoryTarget newConsumer( + public static TreeMemoryTarget newConsumer( TaskMemoryManager tmm, String name, Spiller spiller, diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/OverAcquire.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/OverAcquire.java index ac82161ba7a5d..e7321b4b7e0ed 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/OverAcquire.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/OverAcquire.java @@ -52,31 +52,28 @@ public class OverAcquire implements MemoryTarget { @Override public long borrow(long size) { - Preconditions.checkArgument(size != 0, "Size to borrow is zero"); + if (size == 0) { + return 0; + } + Preconditions.checkState(overTarget.usedBytes() == 0); long granted = target.borrow(size); long majorSize = target.usedBytes(); - long expectedOverAcquired = (long) (ratio * majorSize); - long overAcquired = overTarget.usedBytes(); - long diff = expectedOverAcquired - overAcquired; - if (diff >= 0) { // otherwise, there might be a spill happened during the last borrow() call - overTarget.borrow(diff); // we don't have to check the returned value - } + long overSize = (long) (ratio * majorSize); + long overAcquired = overTarget.borrow(overSize); + Preconditions.checkState(overAcquired == overTarget.usedBytes()); + long releasedOverSize = overTarget.repay(overAcquired); + Preconditions.checkState(releasedOverSize == overAcquired); + Preconditions.checkState(overTarget.usedBytes() == 0); return granted; } @Override public long repay(long size) { - Preconditions.checkArgument(size != 0, "Size to repay is zero"); - long freed = target.repay(size); - // clean up the over-acquired target - long overAcquired = overTarget.usedBytes(); - long freedOverAcquired = overTarget.repay(overAcquired); - Preconditions.checkArgument( - freedOverAcquired == overAcquired, - "Freed over-acquired size is not equal to requested size"); - Preconditions.checkArgument( - overTarget.usedBytes() == 0, "Over-acquired target was not cleaned up"); - return freed; + if (size == 0) { + return 0; + } + Preconditions.checkState(overTarget.usedBytes() == 0); + return target.repay(size); } @Override diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java 
b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java index 24d9fc0e2d4ac..98f79bfff3679 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java @@ -114,6 +114,9 @@ private Node( @Override public long borrow(long size) { + if (size == 0) { + return 0; + } ensureFreeCapacity(size); return borrow0(Math.min(freeBytes(), size)); } @@ -154,6 +157,9 @@ private boolean ensureFreeCapacity(long bytesNeeded) { @Override public long repay(long size) { + if (size == 0) { + return 0; + } long toFree = Math.min(usedBytes(), size); long freed = parent.repay(toFree); selfRecorder.inc(-freed); diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertOutputNode.java b/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertOutputNode.java deleted file mode 100644 index c5804e4e2a10c..0000000000000 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertOutputNode.java +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.gluten.substrait.ddlplan; - -import com.google.protobuf.Any; -import com.google.protobuf.ByteString; -import io.substrait.proto.ReadRel; - -import java.io.Serializable; - -public class InsertOutputNode implements Serializable { - private static final String MERGE_TREE = "MergeTree;"; - private Long partsNum; - private String database = null; - private String tableName = null; - private String relativePath = null; - private StringBuffer extensionTableStr = new StringBuffer(MERGE_TREE); - - InsertOutputNode(Long partsNum, String database, String tableName, String relativePath) { - this.partsNum = partsNum; - this.database = database; - this.tableName = tableName; - this.relativePath = relativePath; - // MergeTree;{database}\n{table}\n{relative_path}\n{min_part}\n{max_part}\n - extensionTableStr - .append(database) - .append("\n") - .append(tableName) - .append("\n") - .append(relativePath) - .append("\n") - .append(this.partsNum) - .append("\n"); - } - - public ReadRel.ExtensionTable toProtobuf() { - ReadRel.ExtensionTable.Builder extensionTableBuilder = ReadRel.ExtensionTable.newBuilder(); - extensionTableBuilder.setDetail( - Any.newBuilder().setValue(ByteString.copyFromUtf8(extensionTableStr.toString()))); - return extensionTableBuilder.build(); - } -} diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertPlanNode.java b/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertPlanNode.java deleted file mode 100644 index c6afa65780c22..0000000000000 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertPlanNode.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.gluten.substrait.ddlplan; - -import org.apache.gluten.substrait.SubstraitContext; -import org.apache.gluten.substrait.plan.PlanNode; - -import io.substrait.proto.DllPlan; -import io.substrait.proto.InsertPlan; - -import java.io.Serializable; - -public class InsertPlanNode implements DllPlanNode, Serializable { - - private final PlanNode inputNode; - - private final SubstraitContext context; - - public InsertPlanNode(SubstraitContext context, PlanNode inputNode) { - this.inputNode = inputNode; - this.context = context; - } - - @Override - public DllPlan toProtobuf() { - InsertPlan.Builder insertBuilder = InsertPlan.newBuilder(); - insertBuilder.setInput(inputNode.toProtobuf()); - if (context.getInsertOutputNode() != null) { - insertBuilder.setOutput(context.getInsertOutputNode().toProtobuf()); - } - - DllPlan.Builder dllBuilder = DllPlan.newBuilder(); - dllBuilder.setInsertPlan(insertBuilder.build()); - return dllBuilder.build(); - } -} diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesNode.java b/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesNode.java index 172a6e8cca69b..04bb9d8cf400f 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesNode.java +++ b/gluten-core/src/main/java/org/apache/gluten/substrait/rel/LocalFilesNode.java @@ -104,7 +104,7 @@ private NamedStruct buildNamedStruct() { for (StructField field : fileSchema.fields()) { structBuilder.addTypes( ConverterUtils.getTypeNode(field.dataType(), field.nullable()).toProtobuf()); - namedStructBuilder.addNames(field.name()); + namedStructBuilder.addNames(ConverterUtils.normalizeColName(field.name())); } namedStructBuilder.setStruct(structBuilder.build()); } diff --git a/gluten-core/src/main/java/org/apache/gluten/vectorized/JniLibLoader.java b/gluten-core/src/main/java/org/apache/gluten/vectorized/JniLibLoader.java index b1feb1e5baebb..37cd29649c5c1 100644 --- a/gluten-core/src/main/java/org/apache/gluten/vectorized/JniLibLoader.java +++ b/gluten-core/src/main/java/org/apache/gluten/vectorized/JniLibLoader.java @@ -35,30 +35,20 @@ import java.util.Collections; import java.util.HashSet; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; -import java.util.Map; -import java.util.Objects; import java.util.Set; import java.util.Vector; -import java.util.concurrent.atomic.AtomicBoolean; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReentrantLock; -import java.util.stream.Collectors; -import java.util.stream.Stream; import scala.runtime.BoxedUnit; -/** - * LoadXXX methods in the utility prevents reloading of a library internally. It's not necessary for - * caller to manage a loaded library list. 
- */ public class JniLibLoader { private static final Logger LOG = LoggerFactory.getLogger(JniLibLoader.class); - private static final Set LOADED_LIBRARY_PATHS = new HashSet<>(); - private static final Set REQUIRE_UNLOAD_LIBRARY_PATHS = new LinkedHashSet<>(); + private static final Set LOADED_LIBRARY_PATHS = + Collections.synchronizedSet(new HashSet<>()); + private static final Set REQUIRE_UNLOAD_LIBRARY_PATHS = + Collections.synchronizedSet(new LinkedHashSet<>()); static { GlutenShutdownManager.addHookForLibUnloading( @@ -69,15 +59,17 @@ public class JniLibLoader { } private final String workDir; - private final Set loadedLibraries = new HashSet<>(); - private final Lock sync = new ReentrantLock(); + private final Set loadedLibraries = Collections.synchronizedSet(new HashSet<>()); JniLibLoader(String workDir) { this.workDir = workDir; } - public static synchronized void forceUnloadAll() { - List loaded = new ArrayList<>(REQUIRE_UNLOAD_LIBRARY_PATHS); + public static void forceUnloadAll() { + List loaded; + synchronized (REQUIRE_UNLOAD_LIBRARY_PATHS) { + loaded = new ArrayList<>(REQUIRE_UNLOAD_LIBRARY_PATHS); + } Collections.reverse(loaded); // use reversed order to unload loaded.forEach(JniLibLoader::unloadFromPath); } @@ -95,21 +87,25 @@ private static String toRealPath(String libPath) { } } - private static synchronized void loadFromPath0(String libPath, boolean requireUnload) { + private static void loadFromPath0(String libPath, boolean requireUnload) { libPath = toRealPath(libPath); - if (LOADED_LIBRARY_PATHS.contains(libPath)) { - LOG.debug("Library in path {} has already been loaded, skipping", libPath); - } else { - System.load(libPath); - LOADED_LIBRARY_PATHS.add(libPath); - LOG.info("Library {} has been loaded using path-loading method", libPath); + synchronized (LOADED_LIBRARY_PATHS) { + if (LOADED_LIBRARY_PATHS.contains(libPath)) { + LOG.debug("Library in path {} has already been loaded, skipping", libPath); + } else { + System.load(libPath); + LOADED_LIBRARY_PATHS.add(libPath); + LOG.info("Library {} has been loaded using path-loading method", libPath); + } } if (requireUnload) { - REQUIRE_UNLOAD_LIBRARY_PATHS.add(libPath); + synchronized (REQUIRE_UNLOAD_LIBRARY_PATHS) { + REQUIRE_UNLOAD_LIBRARY_PATHS.add(libPath); + } } } - public static void loadFromPath(String libPath, boolean requireUnload) { + public static synchronized void loadFromPath(String libPath, boolean requireUnload) { final File file = new File(libPath); if (!file.isFile() || !file.exists()) { throw new GlutenException("library at path: " + libPath + " is not a file or does not exist"); @@ -117,55 +113,42 @@ public static void loadFromPath(String libPath, boolean requireUnload) { loadFromPath0(file.getAbsolutePath(), requireUnload); } - public void mapAndLoad(String unmappedLibName, boolean requireUnload) { - newTransaction().mapAndLoad(unmappedLibName, requireUnload).commit(); - } - - public void load(String libName, boolean requireUnload) { - newTransaction().load(libName, requireUnload).commit(); - } - - public void loadAndCreateLink(String libName, String linkName, boolean requireUnload) { - newTransaction().loadAndCreateLink(libName, linkName, requireUnload).commit(); - } - - public JniLoadTransaction newTransaction() { - return new JniLoadTransaction(); - } - - public static synchronized void unloadFromPath(String libPath) { - if (!LOADED_LIBRARY_PATHS.remove(libPath)) { - LOG.warn("Library {} was not loaded or alreay unloaded:", libPath); - return; + public static void unloadFromPath(String libPath) { + 
synchronized (LOADED_LIBRARY_PATHS) { + if (!LOADED_LIBRARY_PATHS.remove(libPath)) { + LOG.warn("Library {} was not loaded or already unloaded:", libPath); + return; + } } - LOG.info("Starting unload library path: {} ", libPath); - REQUIRE_UNLOAD_LIBRARY_PATHS.remove(libPath); - + synchronized (REQUIRE_UNLOAD_LIBRARY_PATHS) { + REQUIRE_UNLOAD_LIBRARY_PATHS.remove(libPath); + } try { ClassLoader classLoader = JniLibLoader.class.getClassLoader(); Field field = ClassLoader.class.getDeclaredField("nativeLibraries"); field.setAccessible(true); Vector libs = (Vector) field.get(classLoader); - Iterator it = libs.iterator(); - while (it.hasNext()) { - Object object = it.next(); - Field[] fs = object.getClass().getDeclaredFields(); - for (int k = 0; k < fs.length; k++) { - if (fs[k].getName().equals("name")) { - fs[k].setAccessible(true); - - String verbosePath = fs[k].get(object).toString(); - File verboseFile = new File(verbosePath); - String verboseFileName = verboseFile.getName(); - File libFile = new File(libPath); - String libFileName = libFile.getName(); - - if (verboseFileName.equals(libFileName)) { - LOG.info("Finalizing library file: {}", libFileName); - Method finalize = object.getClass().getDeclaredMethod("finalize"); - finalize.setAccessible(true); - finalize.invoke(object); + synchronized (libs) { + Iterator it = libs.iterator(); + while (it.hasNext()) { + Object object = it.next(); + Field[] fs = object.getClass().getDeclaredFields(); + for (int k = 0; k < fs.length; k++) { + if (fs[k].getName().equals("name")) { + fs[k].setAccessible(true); + String verbosePath = fs[k].get(object).toString(); + File verboseFile = new File(verbosePath); + String verboseFileName = verboseFile.getName(); + File libFile = new File(libPath); + String libFileName = libFile.getName(); + + if (verboseFileName.equals(libFileName)) { + LOG.info("Finalizing library file: {}", libFileName); + Method finalize = object.getClass().getDeclaredMethod("finalize"); + finalize.setAccessible(true); + finalize.invoke(object); + } } } } @@ -175,160 +158,77 @@ public static synchronized void unloadFromPath(String libPath) { } } - private static final class LoadRequest { - final String libName; - final String linkName; - final boolean requireUnload; - - private LoadRequest(String libName, String linkName, boolean requireUnload) { - this.libName = libName; - this.linkName = linkName; - this.requireUnload = requireUnload; - } - } - - private static final class LoadAction { - final String libName; - final String linkName; - final boolean requireUnload; - final File file; - - private LoadAction(String libName, String linkName, boolean requireUnload, File file) { - this.libName = libName; - this.linkName = linkName; - this.requireUnload = requireUnload; - this.file = file; - } - - public boolean requireLinking() { - return !Objects.isNull(linkName); - } - } - - public class JniLoadTransaction { - private final AtomicBoolean finished = new AtomicBoolean(false); - private final Map toLoad = new LinkedHashMap<>(); // ordered - - private JniLoadTransaction() { - JniLibLoader.this.sync.lock(); - } - - public JniLoadTransaction mapAndLoad(String unmappedLibName, boolean requireUnload) { + public void mapAndLoad(String unmappedLibName, boolean requireUnload) { + synchronized (loadedLibraries) { try { final String mappedLibName = System.mapLibraryName(unmappedLibName); load(mappedLibName, requireUnload); - return this; } catch (Exception e) { - abort(); throw new GlutenException(e); } } + } - public JniLoadTransaction load(String 
libName, boolean requireUnload) { + public void load(String libName, boolean requireUnload) { + synchronized (loadedLibraries) { try { - toLoad.put(libName, new LoadRequest(libName, null, requireUnload)); - return this; - } catch (Exception e) { - abort(); + if (loadedLibraries.contains(libName)) { + LOG.debug("Library {} has already been loaded, skipping", libName); + return; + } + File file = moveToWorkDir(workDir, libName); + loadWithLink(file.getAbsolutePath(), null, requireUnload); + loadedLibraries.add(libName); + LOG.info("Successfully loaded library {}", libName); + } catch (IOException e) { throw new GlutenException(e); } } + } - public JniLoadTransaction loadAndCreateLink( - String libName, String linkName, boolean requireUnload) { + public void loadAndCreateLink(String libName, String linkName, boolean requireUnload) { + synchronized (loadedLibraries) { try { - toLoad.put(libName, new LoadRequest(libName, linkName, requireUnload)); - return this; - } catch (Exception e) { - abort(); + if (loadedLibraries.contains(libName)) { + LOG.debug("Library {} has already been loaded, skipping", libName); + } + File file = moveToWorkDir(workDir, System.mapLibraryName(libName)); + loadWithLink(file.getAbsolutePath(), linkName, requireUnload); + loadedLibraries.add(libName); + LOG.info("Successfully loaded library {}", libName); + } catch (IOException e) { throw new GlutenException(e); } } + } - public void commit() { - try { - terminate(); - toLoad.entrySet().stream() - .flatMap( - e -> { - try { - final LoadRequest req = e.getValue(); - if (loadedLibraries.contains(req.libName)) { - LOG.debug("Library {} has already been loaded, skipping", req.libName); - return Stream.empty(); - } - // load only libraries not loaded yet - final File file = moveToWorkDir(workDir, req.libName); - return Stream.of( - new LoadAction(req.libName, req.linkName, req.requireUnload, file)); - } catch (IOException ex) { - throw new GlutenException(ex); - } - }) - .collect(Collectors.toList()) - .forEach( - e -> { - try { - LOG.info("Trying to load library {}", e.libName); - loadWithLink(workDir, e); - loadedLibraries.add(e.libName); - LOG.info("Successfully loaded library {}", e.libName); - } catch (Exception ex) { - throw new GlutenException(ex); - } - }); - } finally { - JniLibLoader.this.sync.unlock(); - } - } - - public void abort() { - try { - terminate(); - // do nothing as of now - } finally { - JniLibLoader.this.sync.unlock(); - } - } - - private void terminate() { - if (!finished.compareAndSet(false, true)) { - throw new IllegalStateException(); - } + private File moveToWorkDir(String workDir, String libraryToLoad) throws IOException { + // final File temp = File.createTempFile(workDir, libraryToLoad); + final Path libPath = Paths.get(workDir + "/" + libraryToLoad); + if (Files.exists(libPath)) { + Files.delete(libPath); } - - private File moveToWorkDir(String workDir, String libraryToLoad) throws IOException { - // final File temp = File.createTempFile(workDir, libraryToLoad); - final Path libPath = Paths.get(workDir + "/" + libraryToLoad); - if (Files.exists(libPath)) { - Files.delete(libPath); + final File temp = new File(workDir + "/" + libraryToLoad); + try (InputStream is = JniLibLoader.class.getClassLoader().getResourceAsStream(libraryToLoad)) { + if (is == null) { + throw new FileNotFoundException(libraryToLoad); } - final File temp = new File(workDir + "/" + libraryToLoad); - try (InputStream is = - JniLibLoader.class.getClassLoader().getResourceAsStream(libraryToLoad)) { - if (is == null) { - 
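// Editor's note on the JniLibLoader hunks above: the JniLoadTransaction machinery is replaced by
// Collections.synchronizedSet-backed registries plus an explicit check-then-load inside a
// synchronized block, so each library path or name is loaded at most once across threads. A
// stripped-down sketch of that pattern (hypothetical names; the real System.load call is stubbed
// out so the sketch stays side-effect free):
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;

final class OnceLoader {
  private static final Set<String> LOADED = Collections.synchronizedSet(new HashSet<>());

  static void loadOnce(String libPath) {
    // Hold the set's lock for the whole check-then-act so two threads cannot both decide to load.
    synchronized (LOADED) {
      if (!LOADED.add(libPath)) {
        return; // already loaded, skip silently
      }
      // System.load(libPath); // the real loader calls into the JVM here
    }
  }
}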
throw new FileNotFoundException(libraryToLoad); - } - try { - Files.copy(is, temp.toPath()); - } catch (Exception e) { - throw new GlutenException(e); - } + try { + Files.copy(is, temp.toPath()); + } catch (Exception e) { + throw new GlutenException(e); } - return temp; } + return temp; + } - private void loadWithLink(String workDir, LoadAction req) throws IOException { - String libPath = req.file.getAbsolutePath(); - loadFromPath0(libPath, req.requireUnload); - LOG.info("Library {} has been loaded", libPath); - if (!req.requireLinking()) { - LOG.debug("Symbolic link not required for library {}, skipping", libPath); - return; - } - // create link - Path target = Paths.get(req.file.getPath()); - Path link = Paths.get(workDir, req.linkName); + private void loadWithLink(String libPath, String linkName, boolean requireUnload) + throws IOException { + loadFromPath0(libPath, requireUnload); + LOG.info("Library {} has been loaded", libPath); + if (linkName != null) { + Path target = Paths.get(libPath); + Path link = Paths.get(workDir, linkName); if (Files.exists(link)) { LOG.info("Symbolic link already exists for library {}, deleting", libPath); Files.delete(link); diff --git a/gluten-core/src/main/resources/substrait/proto/substrait/ddl.proto b/gluten-core/src/main/resources/substrait/proto/substrait/ddl.proto deleted file mode 100644 index 833ec87369ae8..0000000000000 --- a/gluten-core/src/main/resources/substrait/proto/substrait/ddl.proto +++ /dev/null @@ -1,25 +0,0 @@ -syntax = "proto3"; - -package substrait; - -import "substrait/plan.proto"; -import "substrait/algebra.proto"; - -option java_multiple_files = true; -option java_package = "io.substrait.proto"; -option csharp_namespace = "Substrait.Protobuf"; - -message DllPlan { - oneof dll_type { - InsertPlan insert_plan = 1; - } -} - -message InsertPlan { - Plan input = 1; - ReadRel.ExtensionTable output = 2; -} - -message Dll { - repeated DllPlan dll_plan = 1; -} \ No newline at end of file diff --git a/gluten-core/src/main/scala/org/apache/gluten/GlutenPlugin.scala b/gluten-core/src/main/scala/org/apache/gluten/GlutenPlugin.scala index 16929ca4bd4a7..6e3484dfa969a 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/GlutenPlugin.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/GlutenPlugin.scala @@ -17,12 +17,11 @@ package org.apache.gluten import org.apache.gluten.GlutenConfig.GLUTEN_DEFAULT_SESSION_TIMEZONE_KEY -import org.apache.gluten.GlutenPlugin.{GLUTEN_SESSION_EXTENSION_NAME, SPARK_SESSION_EXTS_KEY} import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.events.GlutenBuildInfoEvent import org.apache.gluten.exception.GlutenException import org.apache.gluten.expression.ExpressionMappings -import org.apache.gluten.extension.{ColumnarOverrides, OthersExtensionOverrides, QueryStagePrepOverrides} +import org.apache.gluten.extension.GlutenSessionExtensions.{GLUTEN_SESSION_EXTENSION_NAME, SPARK_SESSION_EXTS_KEY} import org.apache.gluten.test.TestStats import org.apache.gluten.utils.TaskListener @@ -31,14 +30,13 @@ import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, import org.apache.spark.internal.Logging import org.apache.spark.listener.GlutenListenerFactory import org.apache.spark.network.util.JavaUtils -import org.apache.spark.sql.SparkSessionExtensions import org.apache.spark.sql.execution.ui.GlutenEventUtils -import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.utils.ExpressionUtil import 
org.apache.spark.util.{SparkResourceUtil, TaskResources} import java.util -import java.util.{Collections, Objects} +import java.util.Collections import scala.collection.mutable @@ -74,10 +72,15 @@ private[gluten] class GlutenDriverPlugin extends DriverPlugin with Logging { BackendsApiManager.initialize() BackendsApiManager.getListenerApiInstance.onDriverStart(sc, pluginContext) GlutenListenerFactory.addToSparkListenerBus(sc) - ExpressionMappings.expressionExtensionTransformer = - ExpressionUtil.extendedExpressionTransformer( - conf.get(GlutenConfig.GLUTEN_EXTENDED_EXPRESSION_TRAN_CONF, "") - ) + + val expressionExtensionTransformer = ExpressionUtil.extendedExpressionTransformer( + conf.get(GlutenConfig.GLUTEN_EXTENDED_EXPRESSION_TRAN_CONF, "") + ) + + if (expressionExtensionTransformer != null) { + ExpressionMappings.expressionExtensionTransformer = expressionExtensionTransformer + } + Collections.emptyMap() } @@ -293,25 +296,4 @@ private[gluten] class GlutenExecutorPlugin extends ExecutorPlugin { } } -private[gluten] class GlutenSessionExtensions extends (SparkSessionExtensions => Unit) { - override def apply(exts: SparkSessionExtensions): Unit = { - GlutenPlugin.DEFAULT_INJECTORS.foreach(injector => injector.inject(exts)) - } -} - -private[gluten] trait GlutenSparkExtensionsInjector { - def inject(extensions: SparkSessionExtensions): Unit -} - -private[gluten] object GlutenPlugin { - val SPARK_SESSION_EXTS_KEY: String = StaticSQLConf.SPARK_SESSION_EXTENSIONS.key - val GLUTEN_SESSION_EXTENSION_NAME: String = - Objects.requireNonNull(classOf[GlutenSessionExtensions].getCanonicalName) - - /** Specify all injectors that Gluten is using in following list. */ - val DEFAULT_INJECTORS: List[GlutenSparkExtensionsInjector] = List( - QueryStagePrepOverrides, - ColumnarOverrides, - OthersExtensionOverrides - ) -} +private object GlutenPlugin {} diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/Backend.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/Backend.scala index 2c465ac619936..3a597552207b4 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/Backend.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/Backend.scala @@ -33,6 +33,8 @@ trait Backend { def listenerApi(): ListenerApi + def ruleApi(): RuleApi + def settings(): BackendSettingsApi } diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendSettingsApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendSettingsApi.scala index 358043cc5f6be..c9205bae9d8fe 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendSettingsApi.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendSettingsApi.scala @@ -31,10 +31,11 @@ import org.apache.spark.sql.execution.datasources.{FileFormat, InsertIntoHadoopF import org.apache.spark.sql.types.StructField trait BackendSettingsApi { - def supportFileFormatRead( + def validateScan( format: ReadFileFormat, fields: Array[StructField], partTable: Boolean, + rootPaths: Seq[String], paths: Seq[String]): ValidationResult = ValidationResult.succeeded def supportWriteFilesExec( format: FileFormat, @@ -68,7 +69,6 @@ trait BackendSettingsApi { case _ => false } def supportStructType(): Boolean = false - def fallbackOnEmptySchema(plan: SparkPlan): Boolean = false // Whether to fallback aggregate at the same time if its empty-output child is fallen back. 
def fallbackAggregateWithEmptyOutputChild(): Boolean = false @@ -89,12 +89,6 @@ trait BackendSettingsApi { def excludeScanExecFromCollapsedStage(): Boolean = false def rescaleDecimalArithmetic: Boolean = false - /** - * Whether to replace sort agg with hash agg., e.g., sort agg will be used in spark's planning for - * string type input. - */ - def replaceSortAggWithHashAgg: Boolean = GlutenConfig.getConf.forceToUseHashAgg - /** Get the config prefix for each backend */ def getBackendConfigPrefix: String @@ -146,9 +140,6 @@ trait BackendSettingsApi { def supportSampleExec(): Boolean = false - /** Merge two phases hash based aggregate if need */ - def mergeTwoPhasesHashBaseAggregateIfNeed(): Boolean = false - def supportColumnarArrowUdf(): Boolean = false def generateHdfsConfForLibhdfs(): Boolean = false diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala index f2c93d8c70fc7..16aa9161eba06 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala @@ -83,6 +83,10 @@ object BackendsApiManager { backend.metricsApi() } + def getRuleApiInstance: RuleApi = { + backend.ruleApi() + } + def getSettings: BackendSettingsApi = { backend.settings } diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllTransformContext.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/RuleApi.scala similarity index 64% rename from gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllTransformContext.scala rename to gluten-core/src/main/scala/org/apache/gluten/backendsapi/RuleApi.scala index d63bb839f9862..f8669a6fe0495 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllTransformContext.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/RuleApi.scala @@ -14,15 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +package org.apache.gluten.backendsapi -package org.apache.gluten.substrait.ddlplan +import org.apache.gluten.extension.injector.RuleInjector -import org.apache.gluten.substrait.SubstraitContext - -import org.apache.spark.sql.catalyst.expressions.Attribute - -case class DllTransformContext( - inputAttributes: Seq[Attribute], - outputAttributes: Seq[Attribute], - root: DllNode, - substraitContext: SubstraitContext = null) +trait RuleApi { + // Injects all Gluten / Spark query planner rules used by the backend. 
+ def injectRules(injector: RuleInjector): Unit +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala index 8f24afae1da48..0227ed5da1276 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/backendsapi/SparkPlanExecApi.scala @@ -27,21 +27,15 @@ import org.apache.spark.ShuffleDependency import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.{GenShuffleWriterParameters, GlutenShuffleWriterWrapper} -import org.apache.spark.sql.{SparkSession, Strategy} -import org.apache.spark.sql.catalyst.FunctionIdentifier -import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression import org.apache.spark.sql.catalyst.optimizer.BuildSide -import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.JoinType -import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning} -import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{BackendWrite, ColumnarWriteFilesExec, FileSourceScanExec, GenerateExec, LeafExecNode, SparkPlan} -import org.apache.spark.sql.execution.datasources.{FileFormat, WriteJobDescription} +import org.apache.spark.sql.execution._ +import org.apache.spark.sql.execution.datasources.FileFormat import org.apache.spark.sql.execution.datasources.v2.{BatchScanExec, FileScan} import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec import org.apache.spark.sql.execution.joins.BuildSideRelation @@ -164,16 +158,6 @@ trait SparkPlanExecApi { original: Expression): ExpressionTransformer = AliasTransformer(substraitExprName, child, original) - /** Generate SplitTransformer. */ - def genStringSplitTransformer( - substraitExprName: String, - srcExpr: ExpressionTransformer, - regexExpr: ExpressionTransformer, - limitExpr: ExpressionTransformer, - original: StringSplit): ExpressionTransformer = { - GenericExpressionTransformer(substraitExprName, Seq(srcExpr, regexExpr, limitExpr), original) - } - /** Generate an expression transformer to transform GetMapValue to Substrait. */ def genGetMapValueTransformer( substraitExprName: String, @@ -353,6 +337,9 @@ trait SparkPlanExecApi { metrics: Map[String, SQLMetric], isSort: Boolean): ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] + /** Determine whether to use sort-based shuffle based on shuffle partitioning and output. */ + def useSortBasedShuffle(partitioning: Partitioning, output: Seq[Attribute]): Boolean + /** * Generate ColumnarShuffleWriter for ColumnarShuffleManager. 
* @@ -385,22 +372,12 @@ trait SparkPlanExecApi { /** Create ColumnarWriteFilesExec */ def createColumnarWriteFilesExec( child: SparkPlan, + noop: SparkPlan, fileFormat: FileFormat, partitionColumns: Seq[Attribute], bucketSpec: Option[BucketSpec], options: Map[String, String], - staticPartitions: TablePartitionSpec): SparkPlan = { - ColumnarWriteFilesExec( - child, - fileFormat, - partitionColumns, - bucketSpec, - options, - staticPartitions) - } - - /** Create BackendWrite */ - def createBackendWrite(description: WriteJobDescription): BackendWrite + staticPartitions: TablePartitionSpec): ColumnarWriteFilesExec /** Create ColumnarArrowEvalPythonExec, for velox backend */ def createColumnarArrowEvalPythonExec( @@ -409,69 +386,6 @@ trait SparkPlanExecApi { child: SparkPlan, evalType: Int): SparkPlan - /** - * Generate extended DataSourceV2 Strategies. Currently only for ClickHouse backend. - * - * @return - */ - def genExtendedDataSourceV2Strategies(): List[SparkSession => Strategy] - - /** - * Generate extended query stage preparation rules. - * - * @return - */ - def genExtendedQueryStagePrepRules(): List[SparkSession => Rule[SparkPlan]] - - /** - * Generate extended Analyzers. Currently only for ClickHouse backend. - * - * @return - */ - def genExtendedAnalyzers(): List[SparkSession => Rule[LogicalPlan]] - - /** - * Generate extended Optimizers. Currently only for Velox backend. - * - * @return - */ - def genExtendedOptimizers(): List[SparkSession => Rule[LogicalPlan]] - - /** - * Generate extended Strategies - * - * @return - */ - def genExtendedStrategies(): List[SparkSession => Strategy] - - /** - * Generate extended columnar pre-rules, in the validation phase. - * - * @return - */ - def genExtendedColumnarValidationRules(): List[SparkSession => Rule[SparkPlan]] - - /** - * Generate extended columnar transform-rules. - * - * @return - */ - def genExtendedColumnarTransformRules(): List[SparkSession => Rule[SparkPlan]] - - /** - * Generate extended columnar post-rules. 
- * - * @return - */ - def genExtendedColumnarPostRules(): List[SparkSession => Rule[SparkPlan]] = { - SparkShimLoader.getSparkShims.getExtendedColumnarPostRules() ::: List() - } - - def genInjectPostHocResolutionRules(): List[SparkSession => Rule[LogicalPlan]] - - def genInjectExtendedParser(): List[(SparkSession, ParserInterface) => ParserInterface] = - List.empty - def genGetStructFieldTransformer( substraitExprName: String, childTransformer: ExpressionTransformer, @@ -682,8 +596,6 @@ trait SparkPlanExecApi { } } - def genInjectedFunctions(): Seq[(FunctionIdentifier, ExpressionInfo, FunctionBuilder)] = Seq.empty - def rewriteSpillPath(path: String): String = path /** diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/BaseDataSource.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/BaseDataSource.scala index b34abd4903018..1a0ff3f845671 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/BaseDataSource.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/BaseDataSource.scala @@ -32,4 +32,6 @@ trait BaseDataSource { /** Returns the input file paths, used to validate the partition column path */ def getInputFilePathsInternal: Seq[String] + + def getRootPathsInternal: Seq[String] } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala index 04697280d7996..b7953b3acab69 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/BasicScanExecTransformer.scala @@ -16,6 +16,7 @@ */ package org.apache.gluten.execution +import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.expression.{ConverterUtils, ExpressionConverter} import org.apache.gluten.extension.ValidationResult @@ -59,6 +60,14 @@ trait BasicScanExecTransformer extends LeafTransformSupport with BaseDataSource } } + def getRootFilePaths: Seq[String] = { + if (GlutenConfig.getConf.scanFileSchemeValidationEnabled) { + getRootPathsInternal + } else { + Seq.empty + } + } + /** Returns the file format properties. 
*/ def getProperties: Map[String, String] = Map.empty @@ -92,7 +101,12 @@ trait BasicScanExecTransformer extends LeafTransformSupport with BaseDataSource } val validationResult = BackendsApiManager.getSettings - .supportFileFormatRead(fileFormat, fields, getPartitionSchema.nonEmpty, getInputFilePaths) + .validateScan( + fileFormat, + fields, + getPartitionSchema.nonEmpty, + getRootFilePaths, + getInputFilePaths) if (!validationResult.ok()) { return validationResult } diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala index 4860847de9acd..553c7c4e0e7a7 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/BatchScanExecTransformer.scala @@ -22,6 +22,7 @@ import org.apache.gluten.extension.ValidationResult import org.apache.gluten.metrics.MetricsUpdater import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat +import org.apache.gluten.utils.FileIndexUtil import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ @@ -131,6 +132,14 @@ abstract class BatchScanExecTransformerBase( } } + override def getRootPathsInternal: Seq[String] = { + scan match { + case fileScan: FileScan => + FileIndexUtil.getRootPath(fileScan.fileIndex) + case _ => Seq.empty + } + } + override def doValidateInternal(): ValidationResult = { if (pushedAggregate.nonEmpty) { return ValidationResult.failed(s"Unsupported aggregation push down for $scan.") diff --git a/gluten-core/src/main/scala/org/apache/gluten/execution/FileSourceScanExecTransformer.scala b/gluten-core/src/main/scala/org/apache/gluten/execution/FileSourceScanExecTransformer.scala index 3b8ed1167afcf..af49cfd1ba026 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/execution/FileSourceScanExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/execution/FileSourceScanExecTransformer.scala @@ -21,6 +21,7 @@ import org.apache.gluten.extension.ValidationResult import org.apache.gluten.metrics.MetricsUpdater import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat +import org.apache.gluten.utils.FileIndexUtil import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, PlanExpression} @@ -126,6 +127,10 @@ abstract class FileSourceScanExecTransformerBase( relation.location.inputFiles.toSeq } + override def getRootPathsInternal: Seq[String] = { + FileIndexUtil.getRootPath(relation.location) + } + override protected def doValidateInternal(): ValidationResult = { if ( !metadataColumns.isEmpty && !BackendsApiManager.getSettings.supportNativeMetadataColumns() diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala index 3ca66b51897b0..8bca5dbf86052 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala @@ -377,14 +377,6 @@ object ExpressionConverter extends SQLConfHelper with Logging { replaceWithExpressionTransformerInternal(t.replaceExpr, attributeSeq, expressionsMap), t ) - case s: StringSplit => - 
BackendsApiManager.getSparkPlanExecApiInstance.genStringSplitTransformer( - substraitExprName, - replaceWithExpressionTransformerInternal(s.str, attributeSeq, expressionsMap), - replaceWithExpressionTransformerInternal(s.regex, attributeSeq, expressionsMap), - replaceWithExpressionTransformerInternal(s.limit, attributeSeq, expressionsMap), - s - ) case r: RegExpReplace => BackendsApiManager.getSparkPlanExecApiInstance.genRegexpReplaceTransformer( substraitExprName, diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala index ebf0c5139245d..f2bb4a90621af 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionMappings.scala @@ -19,7 +19,7 @@ package org.apache.gluten.expression import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.expression.ExpressionNames._ -import org.apache.gluten.extension.ExpressionExtensionTrait +import org.apache.gluten.extension.{DefaultExpressionExtensionTransformer, ExpressionExtensionTrait} import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.sql.catalyst.expressions._ @@ -105,6 +105,7 @@ object ExpressionMappings { Sig[Levenshtein](LEVENSHTEIN), Sig[UnBase64](UNBASE64), Sig[Base64](BASE64), + Sig[FormatString](FORMAT_STRING), // URL functions Sig[ParseUrl](PARSE_URL), @@ -353,5 +354,6 @@ object ExpressionMappings { .toMap[Class[_], String] } - var expressionExtensionTransformer: ExpressionExtensionTrait = _ + var expressionExtensionTransformer: ExpressionExtensionTrait = + DefaultExpressionExtensionTransformer() } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/ColumnarOverrides.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/ColumnarOverrides.scala index 067976b63b2c6..c5a9afec32109 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/ColumnarOverrides.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/ColumnarOverrides.scala @@ -16,17 +16,14 @@ */ package org.apache.gluten.extension -import org.apache.gluten.{GlutenConfig, GlutenSparkExtensionsInjector} import org.apache.gluten.extension.columnar._ -import org.apache.gluten.extension.columnar.enumerated.EnumeratedApplier -import org.apache.gluten.extension.columnar.heuristic.HeuristicApplier import org.apache.gluten.extension.columnar.transition.Transitions import org.apache.gluten.utils.LogLevelUtil import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD -import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.rules.Rule @@ -95,7 +92,9 @@ object ColumnarOverrideRules { } } -case class ColumnarOverrideRules(session: SparkSession) +case class ColumnarOverrideRules( + session: SparkSession, + applierBuilder: SparkSession => ColumnarRuleApplier) extends ColumnarRule with Logging with LogLevelUtil { @@ -117,19 +116,11 @@ case class ColumnarOverrideRules(session: SparkSession) val outputsColumnar = OutputsColumnarTester.inferOutputsColumnar(plan) val unwrapped = OutputsColumnarTester.unwrap(plan) val vanillaPlan = Transitions.insertTransitions(unwrapped, outputsColumnar) - val 
applier: ColumnarRuleApplier = if (GlutenConfig.getConf.enableRas) { - new EnumeratedApplier(session) - } else { - new HeuristicApplier(session) - } + val applier = applierBuilder.apply(session) val out = applier.apply(vanillaPlan, outputsColumnar) out } } -object ColumnarOverrides extends GlutenSparkExtensionsInjector { - override def inject(extensions: SparkSessionExtensions): Unit = { - extensions.injectColumnar(spark => ColumnarOverrideRules(spark)) - } -} +object ColumnarOverrides {} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/GlutenSessionExtensions.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/GlutenSessionExtensions.scala new file mode 100644 index 0000000000000..4456dda615286 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/GlutenSessionExtensions.scala @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.extension + +import org.apache.gluten.backendsapi.BackendsApiManager +import org.apache.gluten.extension.injector.RuleInjector + +import org.apache.spark.sql.SparkSessionExtensions +import org.apache.spark.sql.internal.StaticSQLConf + +import java.util.Objects + +private[gluten] class GlutenSessionExtensions extends (SparkSessionExtensions => Unit) { + override def apply(exts: SparkSessionExtensions): Unit = { + val injector = new RuleInjector() + BackendsApiManager.getRuleApiInstance.injectRules(injector) + injector.inject(exts) + } +} + +private[gluten] object GlutenSessionExtensions { + val SPARK_SESSION_EXTS_KEY: String = StaticSQLConf.SPARK_SESSION_EXTENSIONS.key + val GLUTEN_SESSION_EXTENSION_NAME: String = + Objects.requireNonNull(classOf[GlutenSessionExtensions].getCanonicalName) +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/OthersExtensionOverrides.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/OthersExtensionOverrides.scala deleted file mode 100644 index f2ccf6e81ca17..0000000000000 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/OthersExtensionOverrides.scala +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
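// Editor's note on the RuleApi / GlutenSessionExtensions hunks above: instead of a hard-coded
// DEFAULT_INJECTORS list, the session extension now builds an injector, hands it to the backend's
// RuleApi, and lets the backend register whatever planner rules it needs. A rough Java rendering of
// that inversion of control (all names below are illustrative stand-ins, not Gluten APIs):
import java.util.ArrayList;
import java.util.List;

interface BackendRuleApi {
  void injectRules(SimpleRuleInjector injector); // counterpart of RuleApi.injectRules
}

final class SimpleRuleInjector {
  private final List<Runnable> registrations = new ArrayList<>();

  void inject(Runnable registration) { // the backend queues a registration callback
    registrations.add(registration);
  }

  void apply() { // later applied in one pass, analogous to injector.inject(exts)
    registrations.forEach(Runnable::run);
  }
}

final class SessionExtensionEntryPoint {
  static void wire(BackendRuleApi backend) {
    SimpleRuleInjector injector = new SimpleRuleInjector();
    backend.injectRules(injector); // the backend decides which rules exist
    injector.apply();              // the entry point only triggers registration
  }
}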
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.extension - -import org.apache.gluten.GlutenSparkExtensionsInjector -import org.apache.gluten.backendsapi.BackendsApiManager - -import org.apache.spark.sql.SparkSessionExtensions - -object OthersExtensionOverrides extends GlutenSparkExtensionsInjector { - override def inject(extensions: SparkSessionExtensions): Unit = { - BackendsApiManager.getSparkPlanExecApiInstance - .genInjectExtendedParser() - .foreach(extensions.injectParser) - BackendsApiManager.getSparkPlanExecApiInstance - .genExtendedAnalyzers() - .foreach(extensions.injectResolutionRule) - BackendsApiManager.getSparkPlanExecApiInstance - .genExtendedOptimizers() - .foreach(extensions.injectOptimizerRule) - BackendsApiManager.getSparkPlanExecApiInstance - .genExtendedDataSourceV2Strategies() - .foreach(extensions.injectPlannerStrategy) - BackendsApiManager.getSparkPlanExecApiInstance - .genExtendedStrategies() - .foreach(extensions.injectPlannerStrategy) - BackendsApiManager.getSparkPlanExecApiInstance - .genInjectedFunctions() - .foreach(extensions.injectFunction) - BackendsApiManager.getSparkPlanExecApiInstance - .genInjectPostHocResolutionRules() - .foreach(extensions.injectPostHocResolutionRule) - } -} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/QueryStagePrepOverrides.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/QueryStagePrepOverrides.scala deleted file mode 100644 index 8f9e2326ca717..0000000000000 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/QueryStagePrepOverrides.scala +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.gluten.extension - -import org.apache.gluten.GlutenSparkExtensionsInjector -import org.apache.gluten.backendsapi.BackendsApiManager - -import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} -import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.SparkPlan - -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -object QueryStagePrepOverrides extends GlutenSparkExtensionsInjector { - private val RULES: Seq[SparkSession => Rule[SparkPlan]] = - BackendsApiManager.getSparkPlanExecApiInstance.genExtendedQueryStagePrepRules() - - override def inject(extensions: SparkSessionExtensions): Unit = { - RULES.foreach(extensions.injectQueryStagePrepRule) - } -} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ColumnarRuleApplier.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ColumnarRuleApplier.scala index ee5bcd883e7e7..9b78ccd11de27 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ColumnarRuleApplier.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/ColumnarRuleApplier.scala @@ -17,11 +17,14 @@ package org.apache.gluten.extension.columnar import org.apache.gluten.GlutenConfig +import org.apache.gluten.extension.columnar.util.AdaptiveContext import org.apache.gluten.metrics.GlutenTimeMetric import org.apache.gluten.utils.LogLevelUtil import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.rules.{Rule, RuleExecutor} +import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.sql.execution.SparkPlan trait ColumnarRuleApplier { @@ -29,6 +32,17 @@ trait ColumnarRuleApplier { } object ColumnarRuleApplier { + type ColumnarRuleBuilder = ColumnarRuleCall => Rule[SparkPlan] + + class ColumnarRuleCall( + val session: SparkSession, + val ac: AdaptiveContext, + val outputsColumnar: Boolean) { + val conf: GlutenConfig = { + new GlutenConfig(session.sessionState.conf) + } + } + class Executor(phase: String, rules: Seq[Rule[SparkPlan]]) extends RuleExecutor[SparkPlan] { private val batch: Batch = Batch(s"Columnar (Phase [$phase])", Once, rules.map(r => new LoggedRule(r)): _*) @@ -47,13 +61,19 @@ object ColumnarRuleApplier { private val transformPlanLogLevel = GlutenConfig.getConf.transformPlanLogLevel override val ruleName: String = delegate.ruleName - override def apply(plan: SparkPlan): SparkPlan = GlutenTimeMetric.withMillisTime { - logOnLevel( - transformPlanLogLevel, - s"Preparing to apply rule $ruleName on plan:\n${plan.toString}") - val out = delegate.apply(plan) - logOnLevel(transformPlanLogLevel, s"Plan after applied rule $ruleName:\n${plan.toString}") + private def message(oldPlan: SparkPlan, newPlan: SparkPlan, millisTime: Long): String = + if (!oldPlan.fastEquals(newPlan)) { + s""" + |=== Applying Rule $ruleName took $millisTime ms === + |${sideBySide(oldPlan.treeString, newPlan.treeString).mkString("\n")} + """.stripMargin + } else { s"Rule $ruleName has no effect, took $millisTime ms." 
} + + override def apply(plan: SparkPlan): SparkPlan = { + val (out, millisTime) = GlutenTimeMetric.recordMillisTime(delegate.apply(plan)) + logOnLevel(transformPlanLogLevel, message(plan, out, millisTime)) out - }(t => logOnLevel(transformPlanLogLevel, s"Applying rule $ruleName took $t ms.")) + } + } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/FallbackRules.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/FallbackRules.scala index f9eaa4179c67b..6b043fbce269a 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/FallbackRules.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/FallbackRules.scala @@ -27,8 +27,6 @@ import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.spark.api.python.EvalPythonExecTransformer import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.catalyst.trees.{TreeNode, TreeNodeTag} import org.apache.spark.sql.execution._ @@ -41,7 +39,6 @@ import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.execution.python.{ArrowEvalPythonExec, BatchEvalPythonExec} import org.apache.spark.sql.execution.window.{WindowExec, WindowGroupLimitExecShim} import org.apache.spark.sql.hive.HiveTableScanExecTransformer -import org.apache.spark.sql.types.StringType import org.apache.commons.lang3.exception.ExceptionUtils @@ -241,58 +238,6 @@ case class FallbackMultiCodegens(session: SparkSession) extends Rule[SparkPlan] } } -/** - * This rule plans [[RDDScanExec]] with a fake schema to make gluten work, because gluten does not - * support empty output relation, see [[FallbackEmptySchemaRelation]]. - */ -case class PlanOneRowRelation(spark: SparkSession) extends Rule[SparkPlan] { - override def apply(plan: SparkPlan): SparkPlan = { - if (!GlutenConfig.getConf.enableOneRowRelationColumnar) { - return plan - } - - plan.transform { - // We should make sure the output does not change, e.g. - // Window - // OneRowRelation - case u: UnaryExecNode - if u.child.isInstanceOf[RDDScanExec] && - u.child.asInstanceOf[RDDScanExec].name == "OneRowRelation" && - u.outputSet != u.child.outputSet => - val rdd = spark.sparkContext.parallelize(InternalRow(null) :: Nil, 1) - val attr = AttributeReference("fake_column", StringType)() - u.withNewChildren(RDDScanExec(attr :: Nil, rdd, "OneRowRelation") :: Nil) - } - } -} - -/** - * FIXME To be removed: Since Velox backend is the only one to use the strategy, and we already - * support offloading zero-column batch in ColumnarBatchInIterator via PR #3309. - * - * We'd make sure all Velox operators be able to handle zero-column input correctly then remove the - * rule together with [[PlanOneRowRelation]]. - */ -case class FallbackEmptySchemaRelation() extends Rule[SparkPlan] { - override def apply(plan: SparkPlan): SparkPlan = plan.transformDown { - case p => - if (BackendsApiManager.getSettings.fallbackOnEmptySchema(p)) { - if (p.children.exists(_.output.isEmpty)) { - // Some backends are not eligible to offload plan with zero-column input. - // If any child have empty output, mark the plan and that child as UNSUPPORTED. 
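// Editor's note on the ColumnarRuleApplier.LoggedRule hunk above: the wrapper now measures how long
// a rule takes and only prints the before/after plan comparison when the rule actually changed the
// plan; otherwise it logs a one-line "no effect" message. A simplified, framework-free sketch of the
// same wrapping pattern (UnaryOperator stands in for a planner rule; names are hypothetical):
import java.util.function.UnaryOperator;

final class TimedRule<T> implements UnaryOperator<T> {
  private final String name;
  private final UnaryOperator<T> delegate;

  TimedRule(String name, UnaryOperator<T> delegate) {
    this.name = name;
    this.delegate = delegate;
  }

  @Override
  public T apply(T input) {
    long start = System.currentTimeMillis();
    T output = delegate.apply(input);
    long millis = System.currentTimeMillis() - start;
    if (!output.equals(input)) {
      System.out.printf("=== Applying rule %s took %d ms ===%n%s%n=>%n%s%n", name, millis, input, output);
    } else {
      System.out.printf("Rule %s has no effect, took %d ms.%n", name, millis);
    }
    return output;
  }
}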
- FallbackTags.add(p, "at least one of its children has empty output") - p.children.foreach { - child => - if (child.output.isEmpty && !child.isInstanceOf[WriteFilesExec]) { - FallbackTags.add(child, "at least one of its children has empty output") - } - } - } - } - p - } -} - // This rule will try to convert a plan into plan transformer. // The doValidate function will be called to check if the conversion is supported. // If false is returned or any unsupported exception is thrown, a row guard will diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala index 1d4e11d912c11..70b85165c37b4 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/OffloadSingleNode.scala @@ -205,7 +205,13 @@ object OffloadJoin { case Some(join: Join) => val leftSize = join.left.stats.sizeInBytes val rightSize = join.right.stats.sizeInBytes - if (rightSize <= leftSize) BuildRight else BuildLeft + val leftRowCount = join.left.stats.rowCount + val rightRowCount = join.right.stats.rowCount + if (rightSize == leftSize && rightRowCount.isDefined && leftRowCount.isDefined) { + if (rightRowCount.get <= leftRowCount.get) BuildRight + else BuildLeft + } else if (rightSize <= leftSize) BuildRight + else BuildLeft // Only the ShuffledHashJoinExec generated directly in some spark tests is not link // logical plan, such as OuterJoinSuite. case _ => shj.buildSide @@ -457,7 +463,8 @@ object OffloadOthers { plan.bucketSpec, plan.options, plan.staticPartitions) - BackendsApiManager.getSparkPlanExecApiInstance.createColumnarWriteFilesExec( + + ColumnarWriteFilesExec( writeTransformer, plan.fileFormat, plan.partitionColumns, diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedApplier.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedApplier.scala index 5cf3961c548ba..bebce3a61ae89 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedApplier.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedApplier.scala @@ -16,11 +16,8 @@ */ package org.apache.gluten.extension.columnar.enumerated -import org.apache.gluten.GlutenConfig -import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.extension.columnar._ -import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow, RewriteSubqueryBroadcast} -import org.apache.gluten.extension.columnar.transition.{InsertTransitions, RemoveTransitions} +import org.apache.gluten.extension.columnar.ColumnarRuleApplier.{ColumnarRuleBuilder, ColumnarRuleCall} import org.apache.gluten.extension.columnar.util.AdaptiveContext import org.apache.gluten.utils.{LogLevelUtil, PhysicalPlanSelector} @@ -28,8 +25,7 @@ import org.apache.spark.annotation.Experimental import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{ColumnarCollapseTransformStages, GlutenFallbackReporter, SparkPlan} -import org.apache.spark.util.SparkRuleUtil +import org.apache.spark.sql.execution.SparkPlan /** * Columnar rule applier that optimizes, implements Spark plan into Gluten plan by enumerating on @@ -40,7 +36,7 @@ 
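// Editor's note on the OffloadJoin hunk above: build-side selection still prefers the smaller side
// by estimated size in bytes, but an exact size tie is now broken by row counts when both sides
// report them. A standalone sketch of that decision, with hypothetical stats/enum types:
import java.util.OptionalLong;

enum BuildSide { BUILD_LEFT, BUILD_RIGHT }

final class BuildSideChooser {
  static BuildSide choose(long leftSizeInBytes, long rightSizeInBytes,
                          OptionalLong leftRowCount, OptionalLong rightRowCount) {
    if (rightSizeInBytes == leftSizeInBytes
        && rightRowCount.isPresent() && leftRowCount.isPresent()) {
      // Sizes tie: fall back to row counts, still favoring the right side on equality.
      return rightRowCount.getAsLong() <= leftRowCount.getAsLong()
          ? BuildSide.BUILD_RIGHT : BuildSide.BUILD_LEFT;
    }
    return rightSizeInBytes <= leftSizeInBytes ? BuildSide.BUILD_RIGHT : BuildSide.BUILD_LEFT;
  }
}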
import org.apache.spark.util.SparkRuleUtil * implementing them in EnumeratedTransform. */ @Experimental -class EnumeratedApplier(session: SparkSession) +class EnumeratedApplier(session: SparkSession, ruleBuilders: Seq[ColumnarRuleBuilder]) extends ColumnarRuleApplier with Logging with LogLevelUtil { @@ -53,22 +49,18 @@ class EnumeratedApplier(session: SparkSession) } private val adaptiveContext = AdaptiveContext(session, aqeStackTraceIndex) - override def apply(plan: SparkPlan, outputsColumnar: Boolean): SparkPlan = + override def apply(plan: SparkPlan, outputsColumnar: Boolean): SparkPlan = { + val call = new ColumnarRuleCall(session, adaptiveContext, outputsColumnar) PhysicalPlanSelector.maybe(session, plan) { - val transformed = - transformPlan("transform", transformRules(outputsColumnar).map(_(session)), plan) - val postPlan = maybeAqe { - transformPlan("post", postRules().map(_(session)), transformed) + val finalPlan = maybeAqe { + apply0(ruleBuilders.map(b => b(call)), plan) } - val finalPlan = transformPlan("final", finalRules().map(_(session)), postPlan) finalPlan } + } - private def transformPlan( - phase: String, - rules: Seq[Rule[SparkPlan]], - plan: SparkPlan): SparkPlan = { - val executor = new ColumnarRuleApplier.Executor(phase, rules) + private def apply0(rules: Seq[Rule[SparkPlan]], plan: SparkPlan): SparkPlan = { + val executor = new ColumnarRuleApplier.Executor("ras", rules) executor.execute(plan) } @@ -80,61 +72,4 @@ class EnumeratedApplier(session: SparkSession) adaptiveContext.resetAdaptiveContext() } } - - /** - * Rules to let planner create a suggested Gluten plan being sent to `fallbackPolicies` in which - * the plan will be breakdown and decided to be fallen back or not. - */ - private def transformRules(outputsColumnar: Boolean): Seq[SparkSession => Rule[SparkPlan]] = { - List( - (_: SparkSession) => RemoveTransitions, - (spark: SparkSession) => FallbackOnANSIMode(spark), - (spark: SparkSession) => PlanOneRowRelation(spark), - (_: SparkSession) => FallbackEmptySchemaRelation(), - (_: SparkSession) => RewriteSubqueryBroadcast() - ) ::: - BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarValidationRules() ::: - List((spark: SparkSession) => MergeTwoPhasesHashBaseAggregate(spark)) ::: - List( - (session: SparkSession) => EnumeratedTransform(session, outputsColumnar), - (_: SparkSession) => RemoveTransitions - ) ::: - List( - (_: SparkSession) => RemoveNativeWriteFilesSortAndProject(), - (spark: SparkSession) => RewriteTransformer(spark), - (_: SparkSession) => EnsureLocalSortRequirements, - (_: SparkSession) => EliminateLocalSort, - (_: SparkSession) => CollapseProjectExecTransformer - ) ::: - BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarTransformRules() ::: - SparkRuleUtil - .extendedColumnarRules(session, GlutenConfig.getConf.extendedColumnarTransformRules) ::: - List((_: SparkSession) => InsertTransitions(outputsColumnar)) - } - - /** - * Rules applying to non-fallen-back Gluten plans. To do some post cleanup works on the plan to - * make sure it be able to run and be compatible with Spark's execution engine. 
- */ - private def postRules(): Seq[SparkSession => Rule[SparkPlan]] = - List( - (s: SparkSession) => RemoveTopmostColumnarToRow(s, adaptiveContext.isAdaptiveContext())) ::: - BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarPostRules() ::: - List((_: SparkSession) => ColumnarCollapseTransformStages(GlutenConfig.getConf)) ::: - SparkRuleUtil.extendedColumnarRules(session, GlutenConfig.getConf.extendedColumnarPostRules) - - /* - * Rules consistently applying to all input plans after all other rules have been applied, despite - * whether the input plan is fallen back or not. - */ - private def finalRules(): Seq[SparkSession => Rule[SparkPlan]] = { - List( - // The rule is required despite whether the stage is fallen back or not. Since - // ColumnarCachedBatchSerializer is statically registered to Spark without a columnar rule - // when columnar table cache is enabled. - (s: SparkSession) => RemoveGlutenTableCacheColumnarToRow(s), - (s: SparkSession) => GlutenFallbackReporter(GlutenConfig.getConf, s), - (_: SparkSession) => RemoveFallbackTagRule() - ) - } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedTransform.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedTransform.scala index 4b747eb70075b..007f18fca40bd 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedTransform.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/EnumeratedTransform.scala @@ -18,6 +18,7 @@ package org.apache.gluten.extension.columnar.enumerated import org.apache.gluten.extension.columnar.{OffloadExchange, OffloadJoin, OffloadOthers} import org.apache.gluten.extension.columnar.transition.ConventionReq +import org.apache.gluten.extension.columnar.validator.{Validator, Validators} import org.apache.gluten.planner.GlutenOptimization import org.apache.gluten.planner.cost.GlutenCostModel import org.apache.gluten.planner.property.Conv @@ -41,45 +42,54 @@ case class EnumeratedTransform(session: SparkSession, outputsColumnar: Boolean) extends Rule[SparkPlan] with LogLevelUtil { + private val validator: Validator = Validators + .builder() + .fallbackByHint() + .fallbackIfScanOnly() + .fallbackComplexExpressions() + .fallbackByBackendSettings() + .fallbackByUserOptions() + .fallbackByTestInjects() + .build() + private val rules = List( - new PushFilterToScan(RasOffload.validator), + new PushFilterToScan(validator), RemoveSort, RemoveFilter ) // TODO: Should obey ReplaceSingleNode#applyScanNotTransformable to select // (vanilla) scan with cheaper sub-query plan through cost model. 
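As a sketch of how this rule list can be extended: the shared validator built above is meant to guard every offload, and the `RasOffload.Rule` factory introduced a few hunks below pairs it with a single-node offload. The snippet is illustrative only and not part of the patch; `MyCustomExec` is a hypothetical operator name, and the predicate-based `RasOffload.from` overload is the same one the Hive scan case uses.

```scala
import org.apache.gluten.extension.columnar.OffloadOthers
import org.apache.gluten.extension.columnar.enumerated.RasOffload
import org.apache.gluten.extension.columnar.validator.Validator
import org.apache.gluten.ras.rule.RasRule

import org.apache.spark.sql.execution.SparkPlan

object ExtraOffloadRules {
  // Illustrative only: offload a hypothetical operator through the generic
  // OffloadOthers path, guarded by the same validator EnumeratedTransform builds.
  def extra(validator: Validator): RasRule[SparkPlan] = {
    val offload = RasOffload.from(
      (node: SparkPlan) => node.getClass.getSimpleName == "MyCustomExec", // hypothetical match
      OffloadOthers())
    RasOffload.Rule(offload, validator)
  }
}
```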
- private val offloadRules = List( - RasOffload.from[Exchange](OffloadExchange()).toRule, - RasOffload.from[BaseJoinExec](OffloadJoin()).toRule, - RasOffloadHashAggregate.toRule, - RasOffloadFilter.toRule, - RasOffloadProject.toRule, - RasOffload.from[DataSourceV2ScanExecBase](OffloadOthers()).toRule, - RasOffload.from[DataSourceScanExec](OffloadOthers()).toRule, - RasOffload - .from( - (node: SparkPlan) => HiveTableScanExecTransformer.isHiveTableScan(node), - OffloadOthers()) - .toRule, - RasOffload.from[CoalesceExec](OffloadOthers()).toRule, - RasOffload.from[SortAggregateExec](OffloadOthers()).toRule, - RasOffload.from[ObjectHashAggregateExec](OffloadOthers()).toRule, - RasOffload.from[UnionExec](OffloadOthers()).toRule, - RasOffload.from[ExpandExec](OffloadOthers()).toRule, - RasOffload.from[WriteFilesExec](OffloadOthers()).toRule, - RasOffload.from[SortExec](OffloadOthers()).toRule, - RasOffload.from[TakeOrderedAndProjectExec](OffloadOthers()).toRule, - RasOffload.from[WindowExec](OffloadOthers()).toRule, - RasOffload - .from( - (node: SparkPlan) => SparkShimLoader.getSparkShims.isWindowGroupLimitExec(node), - OffloadOthers()) - .toRule, - RasOffload.from[LimitExec](OffloadOthers()).toRule, - RasOffload.from[GenerateExec](OffloadOthers()).toRule, - RasOffload.from[EvalPythonExec](OffloadOthers()).toRule - ) + private val offloadRules = + Seq( + RasOffload.from[Exchange](OffloadExchange()), + RasOffload.from[BaseJoinExec](OffloadJoin()), + RasOffloadHashAggregate, + RasOffloadFilter, + RasOffloadProject, + RasOffload.from[DataSourceV2ScanExecBase](OffloadOthers()), + RasOffload.from[DataSourceScanExec](OffloadOthers()), + RasOffload + .from( + (node: SparkPlan) => HiveTableScanExecTransformer.isHiveTableScan(node), + OffloadOthers()), + RasOffload.from[CoalesceExec](OffloadOthers()), + RasOffload.from[SortAggregateExec](OffloadOthers()), + RasOffload.from[ObjectHashAggregateExec](OffloadOthers()), + RasOffload.from[UnionExec](OffloadOthers()), + RasOffload.from[ExpandExec](OffloadOthers()), + RasOffload.from[WriteFilesExec](OffloadOthers()), + RasOffload.from[SortExec](OffloadOthers()), + RasOffload.from[TakeOrderedAndProjectExec](OffloadOthers()), + RasOffload.from[WindowExec](OffloadOthers()), + RasOffload + .from( + (node: SparkPlan) => SparkShimLoader.getSparkShims.isWindowGroupLimitExec(node), + OffloadOthers()), + RasOffload.from[LimitExec](OffloadOthers()), + RasOffload.from[GenerateExec](OffloadOthers()), + RasOffload.from[EvalPythonExec](OffloadOthers()) + ).map(RasOffload.Rule(_, validator)) private val optimization = { GlutenOptimization diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffload.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffload.scala index 43f52a9e4758e..fc29b36f08d8f 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffload.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/enumerated/RasOffload.scala @@ -19,7 +19,7 @@ package org.apache.gluten.extension.columnar.enumerated import org.apache.gluten.extension.GlutenPlan import org.apache.gluten.extension.columnar.OffloadSingleNode import org.apache.gluten.extension.columnar.rewrite.RewriteSingleNode -import org.apache.gluten.extension.columnar.validator.{Validator, Validators} +import org.apache.gluten.extension.columnar.validator.Validator import org.apache.gluten.ras.path.Pattern import org.apache.gluten.ras.path.Pattern.node import 
org.apache.gluten.ras.rule.{RasRule, Shape} @@ -49,16 +49,6 @@ object RasOffload { } } - val validator: Validator = Validators - .builder() - .fallbackByHint() - .fallbackIfScanOnly() - .fallbackComplexExpressions() - .fallbackByBackendSettings() - .fallbackByUserOptions() - .fallbackByTestInjects() - .build() - private val rewrites = RewriteSingleNode.allRules() def from[T <: SparkPlan: ClassTag](base: OffloadSingleNode): RasOffload = { @@ -75,70 +65,70 @@ object RasOffload { } } - implicit class RasOffloadOps(base: RasOffload) { - def toRule: RasRule[SparkPlan] = { - new RuleImpl(base) + object Rule { + def apply(base: RasOffload, validator: Validator): RasRule[SparkPlan] = { + new RuleImpl(base, validator) } - } - - private class RuleImpl(base: RasOffload) extends RasRule[SparkPlan] { - private val typeIdentifier: TypeIdentifier = base.typeIdentifier() - final override def shift(node: SparkPlan): Iterable[SparkPlan] = { - // 0. If the node is already offloaded, fail fast. - assert(typeIdentifier.isInstance(node)) - - // 1. Rewrite the node to form that native library supports. - val rewritten = rewrites.foldLeft(node) { - case (node, rewrite) => - node.transformUp { - case p => - val out = rewrite.rewrite(p) - out - } - } + private class RuleImpl(base: RasOffload, validator: Validator) extends RasRule[SparkPlan] { + private val typeIdentifier: TypeIdentifier = base.typeIdentifier() + + final override def shift(node: SparkPlan): Iterable[SparkPlan] = { + // 0. If the node is already offloaded, fail fast. + assert(typeIdentifier.isInstance(node)) + + // 1. Rewrite the node to form that native library supports. + val rewritten = rewrites.foldLeft(node) { + case (node, rewrite) => + node.transformUp { + case p => + val out = rewrite.rewrite(p) + out + } + } - // 2. Walk the rewritten tree. - val offloaded = rewritten.transformUp { - case from if typeIdentifier.isInstance(from) => - // 3. Validate current node. If passed, offload it. - validator.validate(from) match { - case Validator.Passed => - val offloaded = base.offload(from) - val offloadedNodes = offloaded.collect[GlutenPlan] { case t: GlutenPlan => t } - if (offloadedNodes.exists(!_.doValidate().ok())) { - // 4. If native validation fails on the offloaded node, return the - // original one. + // 2. Walk the rewritten tree. + val offloaded = rewritten.transformUp { + case from if typeIdentifier.isInstance(from) => + // 3. Validate current node. If passed, offload it. + validator.validate(from) match { + case Validator.Passed => + val offloaded = base.offload(from) + val offloadedNodes = offloaded.collect[GlutenPlan] { case t: GlutenPlan => t } + if (offloadedNodes.exists(!_.doValidate().ok())) { + // 4. If native validation fails on the offloaded node, return the + // original one. + from + } else { + offloaded + } + case Validator.Failed(reason) => from - } else { - offloaded - } - case Validator.Failed(reason) => - from - } - } + } + } - // 5. If rewritten plan is not offload-able, discard it. - if (offloaded.fastEquals(rewritten)) { - return List.empty - } + // 5. If rewritten plan is not offload-able, discard it. + if (offloaded.fastEquals(rewritten)) { + return List.empty + } - // 6. Otherwise, return the final tree. - List(offloaded) - } + // 6. Otherwise, return the final tree. 
+ List(offloaded) + } - override def shape(): Shape[SparkPlan] = { - pattern(node[SparkPlan](new Pattern.Matcher[SparkPlan] { - override def apply(plan: SparkPlan): Boolean = { - if (plan.isInstanceOf[GlutenPlan]) { - return false - } - if (typeIdentifier.isInstance(plan)) { - return true + override def shape(): Shape[SparkPlan] = { + pattern(node[SparkPlan](new Pattern.Matcher[SparkPlan] { + override def apply(plan: SparkPlan): Boolean = { + if (plan.isInstanceOf[GlutenPlan]) { + return false + } + if (typeIdentifier.isInstance(plan)) { + return true + } + false } - false - } - }).build()) + }).build()) + } } } } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/heuristic/HeuristicApplier.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/heuristic/HeuristicApplier.scala index f776a1dcc3cdf..dea9f01df2a54 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/heuristic/HeuristicApplier.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/heuristic/HeuristicApplier.scala @@ -16,26 +16,26 @@ */ package org.apache.gluten.extension.columnar.heuristic -import org.apache.gluten.GlutenConfig -import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.extension.columnar._ -import org.apache.gluten.extension.columnar.MiscColumnarRules.{RemoveGlutenTableCacheColumnarToRow, RemoveTopmostColumnarToRow, RewriteSubqueryBroadcast, TransformPreOverrides} -import org.apache.gluten.extension.columnar.rewrite.RewriteSparkPlanRulesManager -import org.apache.gluten.extension.columnar.transition.{InsertTransitions, RemoveTransitions} +import org.apache.gluten.extension.columnar.ColumnarRuleApplier.{ColumnarRuleBuilder, ColumnarRuleCall} import org.apache.gluten.extension.columnar.util.AdaptiveContext import org.apache.gluten.utils.{LogLevelUtil, PhysicalPlanSelector} import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.rules.Rule -import org.apache.spark.sql.execution.{ColumnarCollapseTransformStages, GlutenFallbackReporter, SparkPlan} -import org.apache.spark.util.SparkRuleUtil +import org.apache.spark.sql.execution.SparkPlan /** * Columnar rule applier that optimizes, implements Spark plan into Gluten plan by heuristically * applying columnar rules in fixed order. */ -class HeuristicApplier(session: SparkSession) +class HeuristicApplier( + session: SparkSession, + transformBuilders: Seq[ColumnarRuleBuilder], + fallbackPolicyBuilders: Seq[ColumnarRuleBuilder], + postBuilders: Seq[ColumnarRuleBuilder], + finalBuilders: Seq[ColumnarRuleBuilder]) extends ColumnarRuleApplier with Logging with LogLevelUtil { @@ -49,27 +49,27 @@ class HeuristicApplier(session: SparkSession) private val adaptiveContext = AdaptiveContext(session, aqeStackTraceIndex) override def apply(plan: SparkPlan, outputsColumnar: Boolean): SparkPlan = { - withTransformRules(transformRules(outputsColumnar)).apply(plan) + val call = new ColumnarRuleCall(session, adaptiveContext, outputsColumnar) + makeRule(call).apply(plan) } - // Visible for testing. 
- def withTransformRules(transformRules: Seq[SparkSession => Rule[SparkPlan]]): Rule[SparkPlan] = + private def makeRule(call: ColumnarRuleCall): Rule[SparkPlan] = plan => PhysicalPlanSelector.maybe(session, plan) { val finalPlan = prepareFallback(plan) { p => - val suggestedPlan = transformPlan("transform", transformRules.map(_(session)), p) - transformPlan("fallback", fallbackPolicies().map(_(session)), suggestedPlan) match { + val suggestedPlan = transformPlan("transform", transformRules(call), p) + transformPlan("fallback", fallbackPolicies(call), suggestedPlan) match { case FallbackNode(fallbackPlan) => // we should use vanilla c2r rather than native c2r, // and there should be no `GlutenPlan` any more, // so skip the `postRules()`. fallbackPlan case plan => - transformPlan("post", postRules().map(_(session)), plan) + transformPlan("post", postRules(call), plan) } } - transformPlan("final", finalRules().map(_(session)), finalPlan) + transformPlan("final", finalRules(call), finalPlan) } private def transformPlan( @@ -95,69 +95,32 @@ class HeuristicApplier(session: SparkSession) * Rules to let planner create a suggested Gluten plan being sent to `fallbackPolicies` in which * the plan will be breakdown and decided to be fallen back or not. */ - private def transformRules(outputsColumnar: Boolean): Seq[SparkSession => Rule[SparkPlan]] = { - List( - (_: SparkSession) => RemoveTransitions, - (spark: SparkSession) => FallbackOnANSIMode(spark), - (spark: SparkSession) => FallbackMultiCodegens(spark), - (spark: SparkSession) => PlanOneRowRelation(spark), - (_: SparkSession) => RewriteSubqueryBroadcast() - ) ::: - BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarValidationRules() ::: - List( - (_: SparkSession) => FallbackEmptySchemaRelation(), - (spark: SparkSession) => MergeTwoPhasesHashBaseAggregate(spark), - (_: SparkSession) => RewriteSparkPlanRulesManager(), - (_: SparkSession) => AddFallbackTagRule() - ) ::: - List((_: SparkSession) => TransformPreOverrides()) ::: - List( - (_: SparkSession) => RemoveNativeWriteFilesSortAndProject(), - (spark: SparkSession) => RewriteTransformer(spark), - (_: SparkSession) => EnsureLocalSortRequirements, - (_: SparkSession) => EliminateLocalSort, - (_: SparkSession) => CollapseProjectExecTransformer - ) ::: - BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarTransformRules() ::: - SparkRuleUtil - .extendedColumnarRules(session, GlutenConfig.getConf.extendedColumnarTransformRules) ::: - List((_: SparkSession) => InsertTransitions(outputsColumnar)) + private def transformRules(call: ColumnarRuleCall): Seq[Rule[SparkPlan]] = { + transformBuilders.map(b => b.apply(call)) } /** * Rules to add wrapper `FallbackNode`s on top of the input plan, as hints to make planner fall * back the whole input plan to the original vanilla Spark plan. */ - private def fallbackPolicies(): Seq[SparkSession => Rule[SparkPlan]] = { - List( - (_: SparkSession) => - ExpandFallbackPolicy(adaptiveContext.isAdaptiveContext(), adaptiveContext.originalPlan())) + private def fallbackPolicies(call: ColumnarRuleCall): Seq[Rule[SparkPlan]] = { + fallbackPolicyBuilders.map(b => b.apply(call)) } /** * Rules applying to non-fallen-back Gluten plans. To do some post cleanup works on the plan to * make sure it be able to run and be compatible with Spark's execution engine. 
*/ - private def postRules(): Seq[SparkSession => Rule[SparkPlan]] = - List( - (s: SparkSession) => RemoveTopmostColumnarToRow(s, adaptiveContext.isAdaptiveContext())) ::: - BackendsApiManager.getSparkPlanExecApiInstance.genExtendedColumnarPostRules() ::: - List((_: SparkSession) => ColumnarCollapseTransformStages(GlutenConfig.getConf)) ::: - SparkRuleUtil.extendedColumnarRules(session, GlutenConfig.getConf.extendedColumnarPostRules) + private def postRules(call: ColumnarRuleCall): Seq[Rule[SparkPlan]] = { + postBuilders.map(b => b.apply(call)) + } /* * Rules consistently applying to all input plans after all other rules have been applied, despite * whether the input plan is fallen back or not. */ - private def finalRules(): Seq[SparkSession => Rule[SparkPlan]] = { - List( - // The rule is required despite whether the stage is fallen back or not. Since - // ColumnarCachedBatchSerializer is statically registered to Spark without a columnar rule - // when columnar table cache is enabled. - (s: SparkSession) => RemoveGlutenTableCacheColumnarToRow(s), - (s: SparkSession) => GlutenFallbackReporter(GlutenConfig.getConf, s), - (_: SparkSession) => RemoveFallbackTagRule() - ) + private def finalRules(call: ColumnarRuleCall): Seq[Rule[SparkPlan]] = { + finalBuilders.map(b => b.apply(call)) } // Just for test use. @@ -166,3 +129,5 @@ class HeuristicApplier(session: SparkSession) this } } + +object HeuristicApplier {} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/transition/ConventionFunc.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/transition/ConventionFunc.scala index 453df5d88135c..beb80947409ab 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/transition/ConventionFunc.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/transition/ConventionFunc.scala @@ -101,14 +101,14 @@ object ConventionFunc { val out = plan match { case k: Convention.KnownRowType => k.rowType() - case _ if !SparkShimLoader.getSparkShims.supportsRowBased(plan) => - Convention.RowType.None - case _ => + case _ if SparkShimLoader.getSparkShims.supportsRowBased(plan) => Convention.RowType.VanillaRow + case _ => + Convention.RowType.None } - if (out != Convention.RowType.None) { - assert(SparkShimLoader.getSparkShims.supportsRowBased(plan)) - } + assert( + out == Convention.RowType.None || plan.isInstanceOf[Convention.KnownRowType] || + SparkShimLoader.getSparkShims.supportsRowBased(plan)) out } @@ -119,15 +119,13 @@ object ConventionFunc { p match { case k: Convention.KnownBatchType => k.batchType() - case _ if !plan.supportsColumnar => - Convention.BatchType.None - case _ => + case _ if plan.supportsColumnar => Convention.BatchType.VanillaBatch + case _ => + Convention.BatchType.None } ) - if (out != Convention.BatchType.None) { - assert(plan.supportsColumnar) - } + assert(out == Convention.BatchType.None || plan.supportsColumnar) out } diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/transition/Transitions.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/transition/Transitions.scala index d02aadd493d45..602f0303c9095 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/transition/Transitions.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/transition/Transitions.scala @@ -77,22 +77,15 @@ object RemoveTransitions extends Rule[SparkPlan] { object Transitions { def insertTransitions(plan: SparkPlan, outputsColumnar: 
Boolean): SparkPlan = { - val out = InsertTransitions(outputsColumnar).apply(plan) - out + InsertTransitions(outputsColumnar).apply(plan) } def toRowPlan(plan: SparkPlan): SparkPlan = { - val convFunc = ConventionFunc.create() - val req = ConventionReq.of( - ConventionReq.RowType.Is(Convention.RowType.VanillaRow), - ConventionReq.BatchType.Any) - val removed = RemoveTransitions.removeForNode(plan) - val transition = Transition.factory.findTransition( - convFunc.conventionOf(removed), - req, - Transition.notFound(removed, req)) - val out = transition.apply(removed) - out + enforceReq( + plan, + ConventionReq.of( + ConventionReq.RowType.Is(Convention.RowType.VanillaRow), + ConventionReq.BatchType.Any)) } def toBackendBatchPlan(plan: SparkPlan): SparkPlan = { @@ -107,8 +100,13 @@ object Transitions { } private def toBatchPlan(plan: SparkPlan, toBatchType: Convention.BatchType): SparkPlan = { + enforceReq( + plan, + ConventionReq.of(ConventionReq.RowType.Any, ConventionReq.BatchType.Is(toBatchType))) + } + + private def enforceReq(plan: SparkPlan, req: ConventionReq): SparkPlan = { val convFunc = ConventionFunc.create() - val req = ConventionReq.of(ConventionReq.RowType.Any, ConventionReq.BatchType.Is(toBatchType)) val removed = RemoveTransitions.removeForNode(plan) val transition = Transition.factory.findTransition( convFunc.conventionOf(removed), diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/util/AdaptiveContext.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/util/AdaptiveContext.scala index 4a9d69f8f0b19..e1f594fd36e50 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/util/AdaptiveContext.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/util/AdaptiveContext.scala @@ -22,6 +22,7 @@ import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec import scala.collection.mutable.ListBuffer +// Since: https://github.com/apache/incubator-gluten/pull/3294. 
sealed trait AdaptiveContext { def enableAdaptiveContext(): Unit def isAdaptiveContext(): Boolean diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala index a85cb163ceaa7..f1cb4792383bb 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/columnar/validator/Validators.scala @@ -143,8 +143,6 @@ object Validators { case p: SortMergeJoinExec if !settings.supportSortMergeJoinExec() => fail(p) case p: WriteFilesExec if !settings.enableNativeWriteFiles() => fail(p) - case p: SortAggregateExec if !settings.replaceSortAggWithHashAgg => - fail(p) case p: CartesianProductExec if !settings.supportCartesianProductExec() => fail(p) case p: TakeOrderedAndProjectExec if !settings.supportColumnarShuffleExec() => fail(p) case _ => pass() @@ -162,6 +160,7 @@ object Validators { case p: FilterExec if !conf.enableColumnarFilter => fail(p) case p: UnionExec if !conf.enableColumnarUnion => fail(p) case p: ExpandExec if !conf.enableColumnarExpand => fail(p) + case p: SortAggregateExec if !conf.forceToUseHashAgg => fail(p) case p: ShuffledHashJoinExec if !conf.enableColumnarShuffledHashJoin => fail(p) case p: ShuffleExchangeExec if !conf.enableColumnarShuffle => fail(p) case p: BroadcastExchangeExec if !conf.enableColumnarBroadcastExchange => fail(p) diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/injector/GlutenInjector.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/injector/GlutenInjector.scala new file mode 100644 index 0000000000000..728e569cc4eba --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/injector/GlutenInjector.scala @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.extension.injector + +import org.apache.gluten.GlutenConfig +import org.apache.gluten.extension.ColumnarOverrideRules +import org.apache.gluten.extension.columnar.ColumnarRuleApplier +import org.apache.gluten.extension.columnar.ColumnarRuleApplier.ColumnarRuleBuilder +import org.apache.gluten.extension.columnar.enumerated.EnumeratedApplier +import org.apache.gluten.extension.columnar.heuristic.HeuristicApplier + +import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} + +import scala.collection.mutable + +/** Injector used to inject query planner rules into Gluten. 
*/ +class GlutenInjector private[injector] { + import GlutenInjector._ + val legacy: LegacyInjector = new LegacyInjector() + val ras: RasInjector = new RasInjector() + + private[injector] def inject(extensions: SparkSessionExtensions): Unit = { + val ruleBuilder = (session: SparkSession) => new ColumnarOverrideRules(session, applier) + extensions.injectColumnar(session => ruleBuilder(session)) + } + + private def applier(session: SparkSession): ColumnarRuleApplier = { + val conf = new GlutenConfig(session.sessionState.conf) + if (conf.enableRas) { + return ras.createApplier(session) + } + legacy.createApplier(session) + } +} + +object GlutenInjector { + class LegacyInjector { + private val transformBuilders = mutable.Buffer.empty[ColumnarRuleBuilder] + private val fallbackPolicyBuilders = mutable.Buffer.empty[ColumnarRuleBuilder] + private val postBuilders = mutable.Buffer.empty[ColumnarRuleBuilder] + private val finalBuilders = mutable.Buffer.empty[ColumnarRuleBuilder] + + def injectTransform(builder: ColumnarRuleBuilder): Unit = { + transformBuilders += builder + } + + def injectFallbackPolicy(builder: ColumnarRuleBuilder): Unit = { + fallbackPolicyBuilders += builder + } + + def injectPost(builder: ColumnarRuleBuilder): Unit = { + postBuilders += builder + } + + def injectFinal(builder: ColumnarRuleBuilder): Unit = { + finalBuilders += builder + } + + private[injector] def createApplier(session: SparkSession): ColumnarRuleApplier = { + new HeuristicApplier( + session, + transformBuilders.toSeq, + fallbackPolicyBuilders.toSeq, + postBuilders.toSeq, + finalBuilders.toSeq) + } + } + + class RasInjector { + private val ruleBuilders = mutable.Buffer.empty[ColumnarRuleBuilder] + + def inject(builder: ColumnarRuleBuilder): Unit = { + ruleBuilders += builder + } + + private[injector] def createApplier(session: SparkSession): ColumnarRuleApplier = { + new EnumeratedApplier(session, ruleBuilders.toSeq) + } + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/injector/RuleInjector.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/injector/RuleInjector.scala new file mode 100644 index 0000000000000..bccbd38b26d54 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/injector/RuleInjector.scala @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.extension.injector + +import org.apache.spark.sql.SparkSessionExtensions + +/** Injector used to inject query planner rules into Spark and Gluten. 
*/ +class RuleInjector { + val spark: SparkInjector = new SparkInjector() + val gluten: GlutenInjector = new GlutenInjector() + + private[extension] def inject(extensions: SparkSessionExtensions): Unit = { + spark.inject(extensions) + gluten.inject(extensions) + } +} + +object RuleInjector {} diff --git a/gluten-core/src/main/scala/org/apache/gluten/extension/injector/SparkInjector.scala b/gluten-core/src/main/scala/org/apache/gluten/extension/injector/SparkInjector.scala new file mode 100644 index 0000000000000..6935e61bdd5ba --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/extension/injector/SparkInjector.scala @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.extension.injector + +import org.apache.spark.sql.{SparkSession, SparkSessionExtensions, Strategy} +import org.apache.spark.sql.catalyst.FunctionIdentifier +import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FunctionBuilder +import org.apache.spark.sql.catalyst.expressions.ExpressionInfo +import org.apache.spark.sql.catalyst.parser.ParserInterface +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.rules.Rule +import org.apache.spark.sql.execution.SparkPlan + +import scala.collection.mutable + +/** Injector used to inject query planner rules into Spark. 
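A minimal sketch of how these injectors are meant to be fed (illustrative only, not part of the patch). It assumes `ColumnarRuleBuilder` is a function type `ColumnarRuleCall => Rule[SparkPlan]`, which is not shown in this diff but is suggested by how `HeuristicApplier` and `EnumeratedApplier` invoke the builders; `MyNoopColumnarRule` and `MyBackendRuleRegistration` are hypothetical names.

```scala
import org.apache.gluten.extension.columnar.ColumnarRuleApplier.{ColumnarRuleBuilder, ColumnarRuleCall}
import org.apache.gluten.extension.injector.RuleInjector

import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkPlan

// Hypothetical no-op columnar rule, for illustration only.
case class MyNoopColumnarRule(call: ColumnarRuleCall) extends Rule[SparkPlan] {
  override def apply(plan: SparkPlan): SparkPlan = plan
}

object MyBackendRuleRegistration {
  // A backend would typically be handed the injector and register one builder
  // per phase it wants to participate in.
  def register(injector: RuleInjector): Unit = {
    val builder: ColumnarRuleBuilder = call => MyNoopColumnarRule(call)
    injector.gluten.legacy.injectTransform(builder) // heuristic (fixed-order) applier
    injector.gluten.ras.inject(builder)             // enumerated (RAS) applier
  }
}
```

At query time the `GlutenInjector` shown above selects between the two appliers based on whether RAS is enabled, so the same builder can serve both pipelines.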
*/ +class SparkInjector private[injector] { + private type RuleBuilder = SparkSession => Rule[LogicalPlan] + private type StrategyBuilder = SparkSession => Strategy + private type ParserBuilder = (SparkSession, ParserInterface) => ParserInterface + private type FunctionDescription = (FunctionIdentifier, ExpressionInfo, FunctionBuilder) + private type QueryStagePrepRuleBuilder = SparkSession => Rule[SparkPlan] + + private val queryStagePrepRuleBuilders = mutable.Buffer.empty[QueryStagePrepRuleBuilder] + private val parserBuilders = mutable.Buffer.empty[ParserBuilder] + private val resolutionRuleBuilders = mutable.Buffer.empty[RuleBuilder] + private val optimizerRules = mutable.Buffer.empty[RuleBuilder] + private val plannerStrategyBuilders = mutable.Buffer.empty[StrategyBuilder] + private val injectedFunctions = mutable.Buffer.empty[FunctionDescription] + private val postHocResolutionRuleBuilders = mutable.Buffer.empty[RuleBuilder] + + def injectQueryStagePrepRule(builder: QueryStagePrepRuleBuilder): Unit = { + queryStagePrepRuleBuilders += builder + } + + def injectParser(builder: ParserBuilder): Unit = { + parserBuilders += builder + } + + def injectResolutionRule(builder: RuleBuilder): Unit = { + resolutionRuleBuilders += builder + } + + def injectOptimizerRule(builder: RuleBuilder): Unit = { + optimizerRules += builder + } + + def injectPlannerStrategy(builder: StrategyBuilder): Unit = { + plannerStrategyBuilders += builder + } + + def injectFunction(functionDescription: FunctionDescription): Unit = { + injectedFunctions += functionDescription + } + + def injectPostHocResolutionRule(builder: RuleBuilder): Unit = { + postHocResolutionRuleBuilders += builder + } + + private[injector] def inject(extensions: SparkSessionExtensions): Unit = { + queryStagePrepRuleBuilders.foreach(extensions.injectQueryStagePrepRule) + parserBuilders.foreach(extensions.injectParser) + resolutionRuleBuilders.foreach(extensions.injectResolutionRule) + optimizerRules.foreach(extensions.injectOptimizerRule) + plannerStrategyBuilders.foreach(extensions.injectPlannerStrategy) + injectedFunctions.foreach(extensions.injectFunction) + postHocResolutionRuleBuilders.foreach(extensions.injectPostHocResolutionRule) + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala index c028a9559331f..c4d7a42a26f05 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/GlutenCostModel.scala @@ -17,20 +17,16 @@ package org.apache.gluten.planner.cost import org.apache.gluten.GlutenConfig -import org.apache.gluten.extension.columnar.enumerated.RemoveFilter -import org.apache.gluten.extension.columnar.transition.{ColumnarToRowLike, RowToColumnarLike} -import org.apache.gluten.planner.plan.GlutenPlanModel.GroupLeafExec -import org.apache.gluten.ras.{Cost, CostModel} -import org.apache.gluten.utils.PlanUtil +import org.apache.gluten.ras.CostModel import org.apache.spark.internal.Logging -import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan} +import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.utils.ReflectionUtil object GlutenCostModel extends Logging { def find(): CostModel[SparkPlan] = { - val aliases: Map[String, Class[_ <: CostModel[SparkPlan]]] = Map( - "rough" -> classOf[RoughCostModel]) + val aliases: Map[String, Class[_ <: CostModel[SparkPlan]]] = + 
Map("legacy" -> classOf[LegacyCostModel], "rough" -> classOf[RoughCostModel]) val aliasOrClass = GlutenConfig.getConf.rasCostModel val clazz: Class[_ <: CostModel[SparkPlan]] = if (aliases.contains(aliasOrClass)) { aliases(aliasOrClass) @@ -45,55 +41,5 @@ object GlutenCostModel extends Logging { model } - def rough(): CostModel[SparkPlan] = new RoughCostModel() - - private class RoughCostModel extends CostModel[SparkPlan] { - private val infLongCost = Long.MaxValue - - override def costOf(node: SparkPlan): GlutenCost = node match { - case _: GroupLeafExec => throw new IllegalStateException() - case _ => GlutenCost(longCostOf(node)) - } - - private def longCostOf(node: SparkPlan): Long = node match { - case n => - val selfCost = selfLongCostOf(n) - - // Sum with ceil to avoid overflow. - def safeSum(a: Long, b: Long): Long = { - assert(a >= 0) - assert(b >= 0) - val sum = a + b - if (sum < a || sum < b) Long.MaxValue else sum - } - - (n.children.map(longCostOf).toList :+ selfCost).reduce(safeSum) - } - - // A very rough estimation as of now. The cost model basically considers any - // fallen back ops as having extreme high cost so offloads computations as - // much as possible. - private def selfLongCostOf(node: SparkPlan): Long = { - node match { - case _: RemoveFilter.NoopFilter => - // To make planner choose the tree that has applied rule PushFilterToScan. - 0L - case ColumnarToRowExec(child) => 10L - case RowToColumnarExec(child) => 10L - case ColumnarToRowLike(child) => 10L - case RowToColumnarLike(child) => 10L - case p if PlanUtil.isGlutenColumnarOp(p) => 10L - case p if PlanUtil.isVanillaColumnarOp(p) => 1000L - // Other row ops. Usually a vanilla row op. - case _ => 1000L - } - } - - override def costComparator(): Ordering[Cost] = Ordering.Long.on { - case GlutenCost(value) => value - case _ => throw new IllegalStateException("Unexpected cost type") - } - - override def makeInfCost(): Cost = GlutenCost(infLongCost) - } + def legacy(): CostModel[SparkPlan] = new LegacyCostModel() } diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/LegacyCostModel.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/LegacyCostModel.scala new file mode 100644 index 0000000000000..3b631872caa66 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/LegacyCostModel.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.planner.cost + +import org.apache.gluten.extension.columnar.enumerated.RemoveFilter +import org.apache.gluten.extension.columnar.transition.{ColumnarToRowLike, RowToColumnarLike} +import org.apache.gluten.utils.PlanUtil + +import org.apache.spark.sql.execution.{ColumnarToRowExec, RowToColumnarExec, SparkPlan} + +class LegacyCostModel extends LongCostModel { + + // A very rough estimation as of now. The cost model basically considers any + // fallen back ops as having extreme high cost so offloads computations as + // much as possible. + override def selfLongCostOf(node: SparkPlan): Long = { + node match { + case _: RemoveFilter.NoopFilter => + // To make planner choose the tree that has applied rule PushFilterToScan. + 0L + case ColumnarToRowExec(_) => 10L + case RowToColumnarExec(_) => 10L + case ColumnarToRowLike(_) => 10L + case RowToColumnarLike(_) => 10L + case p if PlanUtil.isGlutenColumnarOp(p) => 10L + case p if PlanUtil.isVanillaColumnarOp(p) => 1000L + // Other row ops. Usually a vanilla row op. + case _ => 1000L + } + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/LongCostModel.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/LongCostModel.scala new file mode 100644 index 0000000000000..d3c8410abf887 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/LongCostModel.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.planner.cost + +import org.apache.gluten.planner.plan.GlutenPlanModel.GroupLeafExec +import org.apache.gluten.ras.{Cost, CostModel} + +import org.apache.spark.sql.execution.SparkPlan + +abstract class LongCostModel extends CostModel[SparkPlan] { + private val infLongCost = Long.MaxValue + + override def costOf(node: SparkPlan): GlutenCost = node match { + case _: GroupLeafExec => throw new IllegalStateException() + case _ => GlutenCost(longCostOf(node)) + } + + private def longCostOf(node: SparkPlan): Long = node match { + case n => + val selfCost = selfLongCostOf(n) + + // Sum with ceil to avoid overflow. 
+ def safeSum(a: Long, b: Long): Long = { + assert(a >= 0) + assert(b >= 0) + val sum = a + b + if (sum < a || sum < b) Long.MaxValue else sum + } + + (n.children.map(longCostOf).toList :+ selfCost).reduce(safeSum) + } + + def selfLongCostOf(node: SparkPlan): Long + + override def costComparator(): Ordering[Cost] = Ordering.Long.on { + case GlutenCost(value) => value + case _ => throw new IllegalStateException("Unexpected cost type") + } + + override def makeInfCost(): Cost = GlutenCost(infLongCost) +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/cost/RoughCostModel.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/RoughCostModel.scala new file mode 100644 index 0000000000000..d621c3010c160 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/cost/RoughCostModel.scala @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.planner.cost + +import org.apache.gluten.extension.columnar.enumerated.RemoveFilter +import org.apache.gluten.extension.columnar.transition.{ColumnarToRowLike, RowToColumnarLike} +import org.apache.gluten.utils.PlanUtil + +import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, NamedExpression} +import org.apache.spark.sql.execution.{ColumnarToRowExec, ProjectExec, RowToColumnarExec, SparkPlan} + +class RoughCostModel extends LongCostModel { + + override def selfLongCostOf(node: SparkPlan): Long = { + node match { + case _: RemoveFilter.NoopFilter => + // To make planner choose the tree that has applied rule PushFilterToScan. + 0L + case ProjectExec(projectList, _) if projectList.forall(isCheapExpression) => + // Make trivial ProjectExec has the same cost as ProjectExecTransform to reduce unnecessary + // c2r and r2c. + 10L + case ColumnarToRowExec(_) => 10L + case RowToColumnarExec(_) => 10L + case ColumnarToRowLike(_) => 10L + case RowToColumnarLike(_) => 10L + case p if PlanUtil.isGlutenColumnarOp(p) => 10L + case p if PlanUtil.isVanillaColumnarOp(p) => 1000L + // Other row ops. Usually a vanilla row op. 
+ case _ => 1000L + } + } + + private def isCheapExpression(ne: NamedExpression): Boolean = ne match { + case Alias(_: Attribute, _) => true + case _: Attribute => true + case _ => false + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/planner/plan/GlutenPlanModel.scala b/gluten-core/src/main/scala/org/apache/gluten/planner/plan/GlutenPlanModel.scala index 727613f563f0e..d981de8046a92 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/planner/plan/GlutenPlanModel.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/planner/plan/GlutenPlanModel.scala @@ -49,7 +49,7 @@ object GlutenPlanModel { override protected def doExecute(): RDD[InternalRow] = throw new IllegalStateException() override def output: Seq[Attribute] = metadata.schema().output - override def supportsColumnar(): Boolean = { + override def supportsColumnar: Boolean = { batchType != Convention.BatchType.None } diff --git a/gluten-core/src/main/scala/org/apache/gluten/substrait/SubstraitContext.scala b/gluten-core/src/main/scala/org/apache/gluten/substrait/SubstraitContext.scala index 6cc35bd16c6e7..79148d9f3093b 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/substrait/SubstraitContext.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/substrait/SubstraitContext.scala @@ -16,8 +16,6 @@ */ package org.apache.gluten.substrait -import org.apache.gluten.substrait.ddlplan.InsertOutputNode - import java.lang.{Long => JLong} import java.security.InvalidParameterException import java.util.{ArrayList => JArrayList, HashMap => JHashMap, List => JList, Map => JMap} @@ -64,16 +62,9 @@ class SubstraitContext extends Serializable { private val aggregationParamsMap = new JHashMap[JLong, AggregationParams]() private var iteratorIndex: JLong = 0L - private var insertOutputNode: InsertOutputNode = _ private var operatorId: JLong = 0L private var relId: JLong = 0L - def getInsertOutputNode: InsertOutputNode = insertOutputNode - - def setInsertOutputNode(insertOutputNode: InsertOutputNode): Unit = { - this.insertOutputNode = insertOutputNode - } - def registerFunction(funcName: String): JLong = { if (!functionMap.containsKey(funcName)) { val newFunctionId: JLong = functionMap.size.toLong diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertOutputBuilder.java b/gluten-core/src/main/scala/org/apache/gluten/utils/FileIndexUtil.scala similarity index 71% rename from gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertOutputBuilder.java rename to gluten-core/src/main/scala/org/apache/gluten/utils/FileIndexUtil.scala index 75146de0c58ab..dab593b24fc56 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/InsertOutputBuilder.java +++ b/gluten-core/src/main/scala/org/apache/gluten/utils/FileIndexUtil.scala @@ -14,13 +14,15 @@ * See the License for the specific language governing permissions and * limitations under the License. 
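The new `LongCostModel` base class leaves only the per-node cost to be defined by a concrete model; the recursive summation with overflow-safe addition, the comparator, and the infinite cost are shared. The sketch below shows a third model under that assumption; `MyTunedCostModel` is hypothetical and not part of the patch, and it would additionally need to be made reachable from `GlutenCostModel.find()` (via the alias map or, it appears, a fully qualified class name in the `rasCostModel` setting).

```scala
package org.apache.gluten.planner.cost

import org.apache.gluten.utils.PlanUtil

import org.apache.spark.sql.execution.SparkPlan

// Hypothetical, illustrative cost model built on the LongCostModel skeleton above.
class MyTunedCostModel extends LongCostModel {
  override def selfLongCostOf(node: SparkPlan): Long = node match {
    case p if PlanUtil.isGlutenColumnarOp(p) => 10L   // keep offloaded operators cheap
    case p if PlanUtil.isVanillaColumnarOp(p) => 500L // vanilla columnar: middle ground
    case _ => 1000L                                   // vanilla row operators
  }
}
```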
*/ -package org.apache.gluten.substrait.ddlplan; +package org.apache.gluten.utils -public class InsertOutputBuilder { - private InsertOutputBuilder() {} +import org.apache.spark.sql.execution.datasources._ - public static InsertOutputNode makeInsertOutputNode( - Long partsNum, String database, String tableName, String relativePath) { - return new InsertOutputNode(partsNum, database, tableName, relativePath); +object FileIndexUtil { + def getRootPath(index: FileIndex): Seq[String] = { + index.rootPaths + .filter(_.isAbsolute) + .map(_.toString) + .toSeq } } diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarShuffleExchangeExec.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarShuffleExchangeExec.scala index 26527e1c81c90..f12e3ae0b33e0 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarShuffleExchangeExec.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarShuffleExchangeExec.scala @@ -32,7 +32,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.CoalesceExec.EmptyPartition import org.apache.spark.sql.execution.exchange._ -import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleWriteMetricsReporter} +import org.apache.spark.sql.execution.metric.SQLShuffleWriteMetricsReporter import org.apache.spark.sql.metric.SQLColumnarShuffleReadMetricsReporter import org.apache.spark.sql.vectorized.ColumnarBatch @@ -53,7 +53,7 @@ case class ColumnarShuffleExchangeExec( SQLColumnarShuffleReadMetricsReporter.createShuffleReadMetrics(sparkContext) val useSortBasedShuffle: Boolean = - ColumnarShuffleExchangeExec.useSortBasedShuffle(outputPartitioning, output) + BackendsApiManager.getSparkPlanExecApiInstance.useSortBasedShuffle(outputPartitioning, output) // Note: "metrics" is made transient to avoid sending driver-side metrics to tasks. 
@transient override lazy val metrics = @@ -80,7 +80,7 @@ case class ColumnarShuffleExchangeExec( */ @transient lazy val columnarShuffleDependency: ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = { - ColumnarShuffleExchangeExec.prepareShuffleDependency( + BackendsApiManager.getSparkPlanExecApiInstance.genShuffleDependency( inputColumnarRDD, child.output, projectOutputAttributes, @@ -141,8 +141,13 @@ case class ColumnarShuffleExchangeExec( new ShuffledColumnarBatchRDD(columnarShuffleDependency, readMetrics, partitionSpecs) } - override def stringArgs: Iterator[Any] = - super.stringArgs ++ Iterator(s"[id=#$id]") + override def stringArgs: Iterator[Any] = { + val shuffleWriterType = { + if (useSortBasedShuffle) GlutenConfig.GLUTEN_SORT_SHUFFLE_WRITER + else GlutenConfig.GLUTEN_HASH_SHUFFLE_WRITER + } + super.stringArgs ++ Iterator(s"[shuffle_writer_type=$shuffleWriterType]") + } override def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException() @@ -188,35 +193,6 @@ object ColumnarShuffleExchangeExec extends Logging { ) } - // scalastyle:off argcount - def prepareShuffleDependency( - rdd: RDD[ColumnarBatch], - childOutputAttributes: Seq[Attribute], - projectOutputAttributes: Seq[Attribute], - newPartitioning: Partitioning, - serializer: Serializer, - writeMetrics: Map[String, SQLMetric], - metrics: Map[String, SQLMetric], - isSortBasedShuffle: Boolean) - // scalastyle:on argcount - : ShuffleDependency[Int, ColumnarBatch, ColumnarBatch] = { - BackendsApiManager.getSparkPlanExecApiInstance.genShuffleDependency( - rdd, - childOutputAttributes, - projectOutputAttributes, - newPartitioning: Partitioning, - serializer: Serializer, - writeMetrics, - metrics, - isSortBasedShuffle) - } - - def useSortBasedShuffle(partitioning: Partitioning, output: Seq[Attribute]): Boolean = { - partitioning != SinglePartition && - (partitioning.numPartitions >= GlutenConfig.getConf.columnarShuffleSortPartitionsThreshold || - output.size >= GlutenConfig.getConf.columnarShuffleSortColumnsThreshold) - } - class DummyPairRDDWithPartitions(@transient private val sc: SparkContext, numPartitions: Int) extends RDD[Product2[Int, InternalRow]](sc, Nil) { diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarWriteFilesExec.scala b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarWriteFilesExec.scala index 6f04b84804c56..f8f596dcee1af 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarWriteFilesExec.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/execution/ColumnarWriteFilesExec.scala @@ -19,169 +19,71 @@ package org.apache.spark.sql.execution import org.apache.gluten.backendsapi.BackendsApiManager import org.apache.gluten.exception.GlutenException import org.apache.gluten.extension.GlutenPlan +import org.apache.gluten.extension.columnar.transition.Convention.{KnownRowType, RowType} +import org.apache.gluten.extension.columnar.transition.ConventionReq +import org.apache.gluten.extension.columnar.transition.ConventionReq.KnownChildrenConventions import org.apache.gluten.sql.shims.SparkShimLoader -import org.apache.spark.{Partition, SparkException, TaskContext, TaskOutputFileAlreadyExistException} -import org.apache.spark.internal.io.{FileCommitProtocol, SparkHadoopWriterUtils} +import org.apache.spark.TaskContext +import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.rdd.RDD -import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.sql.catalyst.InternalRow import 
org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeSet} import org.apache.spark.sql.connector.write.WriterCommitMessage import org.apache.spark.sql.execution.datasources._ -import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.vectorized.ColumnarBatch -import org.apache.spark.util.Utils - -import org.apache.hadoop.fs.FileAlreadyExistsException - -import java.util.Date - -/** - * This trait is used in [[ColumnarWriteFilesRDD]] to inject the staging write path before - * initializing the native plan and collect native write files metrics for each backend. - */ -trait BackendWrite { - def collectNativeWriteFilesMetrics(batch: ColumnarBatch): Option[WriteTaskResult] -} - -/** - * This RDD is used to make sure we have injected staging write path before initializing the native - * plan, and support Spark file commit protocol. - */ -class ColumnarWriteFilesRDD( - var prev: RDD[ColumnarBatch], - description: WriteJobDescription, - committer: FileCommitProtocol, - jobTrackerID: String) - extends RDD[WriterCommitMessage](prev) { - - private def reportTaskMetrics(writeTaskResult: WriteTaskResult): Unit = { - val stats = writeTaskResult.summary.stats.head.asInstanceOf[BasicWriteTaskStats] - val (numBytes, numWrittenRows) = (stats.numBytes, stats.numRows) - // Reports bytesWritten and recordsWritten to the Spark output metrics. - // We should update it after calling `commitTask` to overwrite the metrics. - Option(TaskContext.get()).map(_.taskMetrics().outputMetrics).foreach { - outputMetrics => - outputMetrics.setBytesWritten(numBytes) - outputMetrics.setRecordsWritten(numWrittenRows) - } - } - - private def writeFilesForEmptyIterator( - commitProtocol: SparkWriteFilesCommitProtocol): WriteTaskResult = { - val taskAttemptContext = commitProtocol.taskAttemptContext - - val dataWriter = - if (commitProtocol.sparkPartitionId != 0) { - // In case of empty job, leave first partition to save meta for file format like parquet. 
- new EmptyDirectoryDataWriter(description, taskAttemptContext, committer) - } else if (description.partitionColumns.isEmpty) { - new SingleDirectoryDataWriter(description, taskAttemptContext, committer) - } else { - new DynamicPartitionDataSingleWriter(description, taskAttemptContext, committer) - } - - // We have done `setupTask` outside - dataWriter.writeWithIterator(Iterator.empty) - dataWriter.commit() - } - - override def compute(split: Partition, context: TaskContext): Iterator[WriterCommitMessage] = { - val commitProtocol = new SparkWriteFilesCommitProtocol(jobTrackerID, description, committer) - val backendWrite = - BackendsApiManager.getSparkPlanExecApiInstance.createBackendWrite(description) - - commitProtocol.setupTask() - val writePath = commitProtocol.newTaskAttemptTempPath() - val writeFileName = commitProtocol.getFilename - logDebug(s"Native staging write path: $writePath and file name: $writeFileName") - - var writeTaskResult: WriteTaskResult = null - try { - Utils.tryWithSafeFinallyAndFailureCallbacks(block = { - BackendsApiManager.getIteratorApiInstance.injectWriteFilesTempPath(writePath, writeFileName) - - // Initialize the native plan - val iter = firstParent[ColumnarBatch].iterator(split, context) - assert(iter.hasNext) - val resultColumnarBatch = iter.next() - assert(resultColumnarBatch != null) - val nativeWriteTaskResult = backendWrite.collectNativeWriteFilesMetrics(resultColumnarBatch) - if (nativeWriteTaskResult.isEmpty) { - // If we are writing an empty iterator, then velox would do nothing. - // Here we fallback to use vanilla Spark write files to generate an empty file for - // metadata only. - writeTaskResult = writeFilesForEmptyIterator(commitProtocol) - // We have done commit task inside `writeFilesForEmptyIterator`. - } else { - writeTaskResult = nativeWriteTaskResult.get - commitProtocol.commitTask() - } - })( - catchBlock = { - // If there is an error, abort the task - commitProtocol.abortTask() - logError(s"Job ${commitProtocol.getJobId} aborted.") - } - ) - } catch { - case e: FetchFailedException => - throw e - case f: FileAlreadyExistsException if SQLConf.get.fastFailFileFormatOutput => - throw new TaskOutputFileAlreadyExistException(f) - case t: Throwable => - throw new SparkException( - s"Task failed while writing rows to staging path: $writePath, " + - s"output path: ${description.path}", - t) - } - - assert(writeTaskResult != null) - reportTaskMetrics(writeTaskResult) - Iterator.single(writeTaskResult) - } - - override protected def getPartitions: Array[Partition] = firstParent[ColumnarBatch].partitions - - override def clearDependencies(): Unit = { - super.clearDependencies() - prev = null - } -} // The class inherits from "BinaryExecNode" instead of "UnaryExecNode" because // we need to expose a dummy child (as right child) with type "WriteFilesExec" to let Spark // choose the new write code path (version >= 3.4). The actual plan to write is the left child // of this operator. 
-case class ColumnarWriteFilesExec private ( +abstract class ColumnarWriteFilesExec protected ( override val left: SparkPlan, - override val right: SparkPlan, - fileFormat: FileFormat, - partitionColumns: Seq[Attribute], - bucketSpec: Option[BucketSpec], - options: Map[String, String], - staticPartitions: TablePartitionSpec) + override val right: SparkPlan) extends BinaryExecNode with GlutenPlan + with KnownChildrenConventions + with KnownRowType with ColumnarWriteFilesExec.ExecuteWriteCompatible { val child: SparkPlan = left override lazy val references: AttributeSet = AttributeSet.empty - override def supportsColumnar(): Boolean = true + override def supportsColumnar: Boolean = true + + override def requiredChildrenConventions(): Seq[ConventionReq] = { + List(ConventionReq.backendBatch) + } + + /** + * Mark the plan node as outputting both row-based and columnar data. Then we could avoid + * unnecessary transitions from being added on the exit side of the node. + * + * This is feasible based on the assumption that the node doesn't actually involve in either row + * processing or columnar processing. It's true because Spark only calls `doExecuteWrite` of the + * object. + * + * Since https://github.com/apache/incubator-gluten/pull/6745. + */ + override def rowType(): RowType = { + RowType.VanillaRow + } override def output: Seq[Attribute] = Seq.empty override protected def doExecute(): RDD[InternalRow] = { - throw new GlutenException(s"$nodeName does not support doExecute") + throw new GlutenException(s"$nodeName does not implement #doExecute") + } + + override protected def doExecuteColumnar(): RDD[ColumnarBatch] = { + throw new GlutenException(s"$nodeName does not implement #doExecuteColumnar") } /** Fallback to use vanilla Spark write files to generate an empty file for metadata only. */ - private def writeFilesForEmptyRDD( + protected def writeFilesForEmptyRDD( description: WriteJobDescription, committer: FileCommitProtocol, jobTrackerID: String): RDD[WriterCommitMessage] = { @@ -205,25 +107,10 @@ case class ColumnarWriteFilesExec private ( } } + /** We need this to avoid compiler error. */ override def doExecuteWrite(writeFilesSpec: WriteFilesSpec): RDD[WriterCommitMessage] = { - assert(child.supportsColumnar) - - val rdd = child.executeColumnar() - val jobTrackerID = SparkHadoopWriterUtils.createJobTrackerID(new Date()) - val description = writeFilesSpec.description - val committer = writeFilesSpec.committer - if (rdd.partitions.length == 0) { - // SPARK-23271 If we are attempting to write a zero partition rdd, create a dummy single - // partition rdd to make sure we at least set up one write task to write the metadata. 
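The SPARK-23271 comment above describes the zero-partition guard in the removed doExecuteWrite body: when the input RDD has no partitions, the writer still schedules one dummy task so the file format's metadata (for Parquet, an empty file carrying only metadata) gets written. A minimal sketch of that guard, with hypothetical helpers in place of the real RDD[WriterCommitMessage] plumbing:

// --- illustrative sketch (not part of the patch): zero-partition guard ---
// Hypothetical helpers stand in for the real write-task scheduling.
object EmptyWriteSketch {
  def writeFilesForEmptyInput(): Seq[String] =
    Seq("metadata-only file written by a single dummy task")

  def writeFilesNatively(numPartitions: Int): Seq[String] =
    (0 until numPartitions).map(i => s"files written by native task $i")

  // Mirrors the guard in the removed doExecuteWrite body: with zero input
  // partitions we still run one write task so the format's metadata is
  // produced; otherwise the native write path handles every partition.
  def doWrite(numPartitions: Int): Seq[String] =
    if (numPartitions == 0) writeFilesForEmptyInput()
    else writeFilesNatively(numPartitions)
}
// --- end sketch ---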
- writeFilesForEmptyRDD(description, committer, jobTrackerID) - } else { - new ColumnarWriteFilesRDD(rdd, description, committer, jobTrackerID) - } + throw new GlutenException(s"$nodeName does not implement #doExecuteWrite") } - override protected def withNewChildrenInternal( - newLeft: SparkPlan, - newRight: SparkPlan): SparkPlan = - copy(newLeft, newRight, fileFormat, partitionColumns, bucketSpec, options, staticPartitions) } object ColumnarWriteFilesExec { @@ -253,7 +140,7 @@ object ColumnarWriteFilesExec { options, staticPartitions) - ColumnarWriteFilesExec( + BackendsApiManager.getSparkPlanExecApiInstance.createColumnarWriteFilesExec( child, right, fileFormat, @@ -272,9 +159,7 @@ object ColumnarWriteFilesExec { sealed trait ExecuteWriteCompatible { // To be compatible with Spark (version < 3.4) protected def doExecuteWrite(writeFilesSpec: WriteFilesSpec): RDD[WriterCommitMessage] = { - throw new GlutenException( - s"Internal Error ${this.getClass} has write support" + - s" mismatch:\n${this}") + throw new GlutenException("Illegal state: The method is not expected to be called") } } } diff --git a/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveTableScanExecTransformer.scala b/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveTableScanExecTransformer.scala index 8133f1d4218bf..938bac2b1b2c5 100644 --- a/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveTableScanExecTransformer.scala +++ b/gluten-core/src/main/scala/org/apache/spark/sql/hive/HiveTableScanExecTransformer.scala @@ -76,6 +76,9 @@ case class HiveTableScanExecTransformer( Seq.empty } + // TODO: get root paths from hive table. + override def getRootPathsInternal: Seq[String] = Seq.empty + override def metricsUpdater(): MetricsUpdater = BackendsApiManager.getMetricsApiInstance.genHiveTableScanTransformerMetricsUpdater(metrics) diff --git a/gluten-core/src/main/scala/org/apache/spark/util/SparkDirectoryUtil.scala b/gluten-core/src/main/scala/org/apache/spark/util/SparkDirectoryUtil.scala index fbc59edfdd6be..833575178c664 100644 --- a/gluten-core/src/main/scala/org/apache/spark/util/SparkDirectoryUtil.scala +++ b/gluten-core/src/main/scala/org/apache/spark/util/SparkDirectoryUtil.scala @@ -79,7 +79,7 @@ object SparkDirectoryUtil extends Logging { return } if (INSTANCE.roots.toSet != roots.toSet) { - logWarning( + throw new IllegalArgumentException( s"Reinitialize SparkDirectoryUtil with different root dirs: old: ${INSTANCE.ROOTS .mkString("Array(", ", ", ")")}, new: ${roots.mkString("Array(", ", ", ")")}" ) diff --git a/gluten-core/src/main/scala/org/apache/spark/util/SparkRuleUtil.scala b/gluten-core/src/main/scala/org/apache/spark/util/SparkPlanRules.scala similarity index 55% rename from gluten-core/src/main/scala/org/apache/spark/util/SparkRuleUtil.scala rename to gluten-core/src/main/scala/org/apache/spark/util/SparkPlanRules.scala index 100ec36d24243..bbaee81a59879 100644 --- a/gluten-core/src/main/scala/org/apache/spark/util/SparkRuleUtil.scala +++ b/gluten-core/src/main/scala/org/apache/spark/util/SparkPlanRules.scala @@ -21,36 +21,48 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.SparkPlan -object SparkRuleUtil extends Logging { - - /** Add the extended pre/post column rules */ - def extendedColumnarRules( - session: SparkSession, - conf: String - ): List[SparkSession => Rule[SparkPlan]] = { - val extendedRules = conf.split(",").filter(_.nonEmpty) - extendedRules - .map { - ruleStr => +object SparkPlanRules extends 
Logging { + // Since https://github.com/apache/incubator-gluten/pull/1523 + def extendedColumnarRule(ruleNamesStr: String): SparkSession => Rule[SparkPlan] = + (session: SparkSession) => { + val ruleNames = ruleNamesStr.split(",").filter(_.nonEmpty) + val rules = ruleNames.flatMap { + ruleName => try { - val extensionConfClass = Utils.classForName(ruleStr) - val extensionConf = - extensionConfClass + val ruleClass = Utils.classForName(ruleName) + val rule = + ruleClass .getConstructor(classOf[SparkSession]) .newInstance(session) .asInstanceOf[Rule[SparkPlan]] - - Some((sparkSession: SparkSession) => extensionConf) + Some(rule) } catch { // Ignore the error if we cannot find the class or when the class has the wrong type. case e @ (_: ClassCastException | _: ClassNotFoundException | _: NoClassDefFoundError) => - logWarning(s"Cannot create extended rule $ruleStr", e) + logWarning(s"Cannot create extended rule $ruleName", e) None } } - .filter(_.isDefined) - .map(_.get) - .toList + new OrderedRules(rules) + } + + object EmptyRule extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = plan + } + + class AbortRule(message: String) extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = + throw new IllegalStateException( + "AbortRule is being executed, this should not happen. Reason: " + message) + } + + class OrderedRules(rules: Seq[Rule[SparkPlan]]) extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = { + rules.foldLeft(plan) { + case (plan, rule) => + rule.apply(plan) + } + } } } diff --git a/gluten-core/src/main/scala/org/apache/spark/util/SparkResourceUtil.scala b/gluten-core/src/main/scala/org/apache/spark/util/SparkResourceUtil.scala index b16d43de5d680..f8c791fe13749 100644 --- a/gluten-core/src/main/scala/org/apache/spark/util/SparkResourceUtil.scala +++ b/gluten-core/src/main/scala/org/apache/spark/util/SparkResourceUtil.scala @@ -76,4 +76,8 @@ object SparkResourceUtil extends Logging { val taskCores = conf.getInt("spark.task.cpus", 1) executorCores / taskCores } + + def isLocalMaster(conf: SparkConf): Boolean = { + Utils.isLocalMaster(conf) + } } diff --git a/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java b/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java index 1632e5ef45106..bbc43ba5dea98 100644 --- a/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java +++ b/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java @@ -17,6 +17,8 @@ package org.apache.gluten.memory.memtarget.spark; import org.apache.gluten.GlutenConfig; +import org.apache.gluten.memory.memtarget.MemoryTarget; +import org.apache.gluten.memory.memtarget.Spiller; import org.apache.gluten.memory.memtarget.Spillers; import org.apache.gluten.memory.memtarget.TreeMemoryTarget; @@ -28,6 +30,8 @@ import org.junit.Test; import java.util.Collections; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.atomic.AtomicLong; import scala.Function0; @@ -100,6 +104,82 @@ public void testIsolatedAndShared() { }); } + @Test + public void testSpill() { + test( + () -> { + final Spillers.AppendableSpillerList spillers = Spillers.appendable(); + final TreeMemoryTarget shared = + TreeMemoryConsumers.shared() + .newConsumer( + TaskContext.get().taskMemoryManager(), + "FOO", + spillers, + Collections.emptyMap()); + final AtomicInteger numSpills = new AtomicInteger(0); + final 
AtomicLong numSpilledBytes = new AtomicLong(0L); + spillers.append( + new Spiller() { + @Override + public long spill(MemoryTarget self, Phase phase, long size) { + long repaid = shared.repay(size); + numSpills.getAndIncrement(); + numSpilledBytes.getAndAdd(repaid); + return repaid; + } + }); + Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(1, numSpills.get()); + Assert.assertEquals(200, numSpilledBytes.get()); + Assert.assertEquals(400, shared.usedBytes()); + + Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(3, numSpills.get()); + Assert.assertEquals(800, numSpilledBytes.get()); + Assert.assertEquals(400, shared.usedBytes()); + }); + } + + @Test + public void testOverSpill() { + test( + () -> { + final Spillers.AppendableSpillerList spillers = Spillers.appendable(); + final TreeMemoryTarget shared = + TreeMemoryConsumers.shared() + .newConsumer( + TaskContext.get().taskMemoryManager(), + "FOO", + spillers, + Collections.emptyMap()); + final AtomicInteger numSpills = new AtomicInteger(0); + final AtomicLong numSpilledBytes = new AtomicLong(0L); + spillers.append( + new Spiller() { + @Override + public long spill(MemoryTarget self, Phase phase, long size) { + long repaid = shared.repay(Long.MAX_VALUE); + numSpills.getAndIncrement(); + numSpilledBytes.getAndAdd(repaid); + return repaid; + } + }); + Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(1, numSpills.get()); + Assert.assertEquals(300, numSpilledBytes.get()); + Assert.assertEquals(300, shared.usedBytes()); + + Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(3, numSpills.get()); + Assert.assertEquals(900, numSpilledBytes.get()); + Assert.assertEquals(300, shared.usedBytes()); + }); + } + private void test(Runnable r) { TaskResources$.MODULE$.runUnsafe( new Function0() { diff --git a/gluten-core/src/test/resources/SubStraitTest-Q6.dat b/gluten-core/src/test/resources/SubStraitTest-Q6.dat deleted file mode 100644 index f4858a30fc381..0000000000000 Binary files a/gluten-core/src/test/resources/SubStraitTest-Q6.dat and /dev/null differ diff --git a/gluten-core/src/test/scala/org/apache/gluten/test/FallbackUtil.scala b/gluten-core/src/test/scala/org/apache/gluten/test/FallbackUtil.scala index d2626ab275cea..3d26dd16c4eb3 100644 --- a/gluten-core/src/test/scala/org/apache/gluten/test/FallbackUtil.scala +++ b/gluten-core/src/test/scala/org/apache/gluten/test/FallbackUtil.scala @@ -20,11 +20,11 @@ import org.apache.gluten.extension.GlutenPlan import org.apache.spark.internal.Logging import org.apache.spark.sql.execution._ -import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper, QueryStageExec} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper, AQEShuffleReadExec, QueryStageExec} import org.apache.spark.sql.execution.exchange.ReusedExchangeExec /** - * attention: if AQE is enable,This method will only be executed correctly after the execution plan + * attention: if AQE is enabled,This method will only be executed correctly after the execution plan * is fully determined */ @@ -42,10 +42,14 @@ object FallbackUtil extends Logging with AdaptiveSparkPlanHelper { true case WholeStageCodegenExec(_) => true + case ColumnarInputAdapter(_) => + true case InputAdapter(_) => true case 
AdaptiveSparkPlanExec(_, _, _, _, _) => true + case AQEShuffleReadExec(_, _) => + true case _: LimitExec => true // for ut @@ -57,30 +61,15 @@ object FallbackUtil extends Logging with AdaptiveSparkPlanHelper { true case _: ReusedExchangeExec => true - case p: SparkPlan if p.supportsColumnar => - true case _ => false } } def hasFallback(plan: SparkPlan): Boolean = { - var fallbackOperator: Seq[SparkPlan] = null - if (plan.isInstanceOf[AdaptiveSparkPlanExec]) { - fallbackOperator = collectWithSubqueries(plan) { - case plan if !plan.isInstanceOf[GlutenPlan] && !skip(plan) => - plan - } - } else { - fallbackOperator = plan.collectWithSubqueries { - case plan if !plan.isInstanceOf[GlutenPlan] && !skip(plan) => - plan - } - } - - if (fallbackOperator.nonEmpty) { - fallbackOperator.foreach(operator => log.info(s"gluten fallback operator:{$operator}")) - } + val fallbackOperator = collectWithSubqueries(plan) { case plan => plan }.filterNot( + plan => plan.isInstanceOf[GlutenPlan] || skip(plan)) + fallbackOperator.foreach(operator => log.info(s"gluten fallback operator:{$operator}")) fallbackOperator.nonEmpty } } diff --git a/gluten-core/src/test/scala/org/apache/spark/softaffinity/SoftAffinitySuite.scala b/gluten-core/src/test/scala/org/apache/spark/softaffinity/SoftAffinitySuite.scala index c6c4fcc5fa1f5..ea3e50e812825 100644 --- a/gluten-core/src/test/scala/org/apache/spark/softaffinity/SoftAffinitySuite.scala +++ b/gluten-core/src/test/scala/org/apache/spark/softaffinity/SoftAffinitySuite.scala @@ -39,6 +39,8 @@ class SoftAffinitySuite extends QueryTest with SharedSparkSession with Predicate .set(GlutenConfig.GLUTEN_SOFT_AFFINITY_REPLICATIONS_NUM, "2") .set(GlutenConfig.GLUTEN_SOFT_AFFINITY_MIN_TARGET_HOSTS, "2") + val scalaVersion = scala.util.Properties.versionNumberString + def generateNativePartition1(): Unit = { val partition = FilePartition( 0, @@ -97,7 +99,13 @@ class SoftAffinitySuite extends QueryTest with SharedSparkSession with Predicate val nativePartition = GlutenPartition(0, PlanBuilder.EMPTY_PLAN, locations = locations) - assertResult(Set("host-1", "host-4", "host-5")) { + val affinityResultSet = if (scalaVersion.startsWith("2.12")) { + Set("host-1", "host-4", "host-5") + } else if (scalaVersion.startsWith("2.13")) { + Set("host-5", "host-4", "host-2") + } + + assertResult(affinityResultSet) { nativePartition.preferredLocations().toSet } } @@ -184,7 +192,13 @@ class SoftAffinitySuite extends QueryTest with SharedSparkSession with Predicate val nativePartition = GlutenPartition(0, PlanBuilder.EMPTY_PLAN, locations = locations) - assertResult(Set("host-1", "host-5", "host-6")) { + val affinityResultSet = if (scalaVersion.startsWith("2.12")) { + Set("host-1", "host-5", "host-6") + } else if (scalaVersion.startsWith("2.13")) { + Set("host-6", "host-5", "host-2") + } + + assertResult(affinityResultSet) { nativePartition.preferredLocations().toSet } } diff --git a/gluten-data/pom.xml b/gluten-data/pom.xml index 500708d449694..bca3143cf6c6c 100644 --- a/gluten-data/pom.xml +++ b/gluten-data/pom.xml @@ -87,13 +87,13 @@ org.apache.arrow ${arrow-memory.artifact} - ${arrow.version} + ${arrow-gluten.version} runtime org.apache.arrow arrow-memory-core - ${arrow.version} + ${arrow-gluten.version} compile @@ -109,7 +109,7 @@ org.apache.arrow arrow-vector - ${arrow.version} + ${arrow-gluten.version} io.netty @@ -195,6 +195,52 @@ + + org.apache.spark + spark-core_${scala.binary.version} + test-jar + test + + + org.apache.spark + spark-sql_${scala.binary.version} + test-jar + test + + + 
org.apache.spark + spark-catalyst_${scala.binary.version} + test-jar + test + + + org.scalatest + scalatest_${scala.binary.version} + test + + + org.mockito + mockito-core + 2.23.4 + test + + + junit + junit + test + + + org.scalatestplus + scalatestplus-mockito_${scala.binary.version} + 1.0.0-M2 + test + + + org.scalatestplus + scalatestplus-scalacheck_${scala.binary.version} + 3.1.0.0-RC2 + test + diff --git a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatchJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatchJniWrapper.java index e71e9d7bee1b5..37376951c5435 100644 --- a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatchJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatchJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.columnarbatch; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; public class ColumnarBatchJniWrapper implements RuntimeAware { private final Runtime runtime; diff --git a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatches.java b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatches.java index a72eaafae4d91..fd9c72c36060b 100644 --- a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatches.java +++ b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/ColumnarBatches.java @@ -16,12 +16,13 @@ */ package org.apache.gluten.columnarbatch; -import org.apache.gluten.exception.GlutenException; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.Runtimes; +import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.Runtimes; import org.apache.gluten.utils.ArrowAbiUtil; import org.apache.gluten.utils.ArrowUtil; import org.apache.gluten.utils.ImplicitClass; +import org.apache.gluten.utils.InternalRowUtl; import org.apache.gluten.vectorized.ArrowWritableColumnVector; import com.google.common.annotations.VisibleForTesting; @@ -32,26 +33,19 @@ import org.apache.arrow.memory.BufferAllocator; import org.apache.spark.sql.catalyst.InternalRow; import org.apache.spark.sql.catalyst.expressions.UnsafeRow; +import org.apache.spark.sql.types.StructType; +import org.apache.spark.sql.utils.SparkArrowUtil; import org.apache.spark.sql.vectorized.ColumnVector; import org.apache.spark.sql.vectorized.ColumnarBatch; +import org.apache.spark.sql.vectorized.ColumnarBatchUtil; -import java.lang.reflect.Field; import java.util.Arrays; import java.util.Iterator; import java.util.NoSuchElementException; +import scala.collection.JavaConverters; + public class ColumnarBatches { - private static final Field FIELD_COLUMNS; - - static { - try { - Field f = ColumnarBatch.class.getDeclaredField("columns"); - f.setAccessible(true); - FIELD_COLUMNS = f; - } catch (NoSuchFieldException e) { - throw new GlutenException(e); - } - } private ColumnarBatches() {} @@ -90,21 +84,6 @@ private static BatchType identifyBatchType(ColumnarBatch batch) { return BatchType.HEAVY; } - private static void transferVectors(ColumnarBatch from, ColumnarBatch target) { - try { - if (target.numCols() != from.numCols()) { - throw new IllegalStateException(); - } - final ColumnVector[] newVectors = new ColumnVector[from.numCols()]; - for (int i = 0; i < target.numCols(); i++) { - newVectors[i] = from.column(i); - } - 
FIELD_COLUMNS.set(target, newVectors); - } catch (IllegalAccessException e) { - throw new GlutenException(e); - } - } - /** Heavy batch: Data is readable from JVM and formatted as Arrow data. */ public static boolean isHeavyBatch(ColumnarBatch batch) { return identifyBatchType(batch) == BatchType.HEAVY; @@ -201,8 +180,9 @@ private static ColumnarBatch load(BufferAllocator allocator, ColumnarBatch input } // populate new vectors to input - transferVectors(output, input); - return input; + ColumnarBatchUtil.transferVectors(output, input); + + return output; } } @@ -236,7 +216,7 @@ private static ColumnarBatch offload(BufferAllocator allocator, ColumnarBatch in } // populate new vectors to input - transferVectors(output, input); + ColumnarBatchUtil.transferVectors(output, input); return input; } } @@ -379,4 +359,11 @@ public static void release(ColumnarBatch b) { public static long getNativeHandle(ColumnarBatch batch) { return getIndicatorVector(batch).handle(); } + + public static String toString(ColumnarBatch batch, int start, int length) { + ColumnarBatch loadedBatch = ensureLoaded(ArrowBufferAllocators.contextInstance(), batch); + StructType type = SparkArrowUtil.fromArrowSchema(ArrowUtil.toSchema(loadedBatch)); + return InternalRowUtl.toString( + type, JavaConverters.asScalaIterator(loadedBatch.rowIterator()), start, length); + } } diff --git a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorBase.java b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorBase.java index 1bc685bd5cebe..700eb3cadeeee 100644 --- a/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorBase.java +++ b/gluten-data/src/main/java/org/apache/gluten/columnarbatch/IndicatorVectorBase.java @@ -16,7 +16,7 @@ */ package org.apache.gluten.columnarbatch; -import org.apache.gluten.exec.Runtimes; +import org.apache.gluten.runtime.Runtimes; import org.apache.spark.sql.types.DataTypes; import org.apache.spark.sql.types.Decimal; diff --git a/gluten-data/src/main/java/org/apache/gluten/datasource/DatasourceJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/datasource/DatasourceJniWrapper.java index 11ed3fb7df8c9..4550dbd9a4d84 100644 --- a/gluten-data/src/main/java/org/apache/gluten/datasource/DatasourceJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/datasource/DatasourceJniWrapper.java @@ -16,9 +16,9 @@ */ package org.apache.gluten.datasource; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; import org.apache.gluten.init.JniUtils; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; import org.apache.spark.sql.execution.datasources.BlockStripes; diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java index 4af8eb4e3f827..7c7fac8daacd2 100644 --- a/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java +++ b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ManagedReservationListener.java @@ -19,7 +19,6 @@ import org.apache.gluten.memory.SimpleMemoryUsageRecorder; import org.apache.gluten.memory.memtarget.MemoryTarget; -import com.google.common.base.Preconditions; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -29,16 +28,23 @@ public class ManagedReservationListener implements ReservationListener { private static final Logger LOG = 
LoggerFactory.getLogger(ManagedReservationListener.class); private final MemoryTarget target; - private final SimpleMemoryUsageRecorder sharedUsage; // shared task metrics + // Metrics shared by task. + private final SimpleMemoryUsageRecorder sharedUsage; + // Lock shared by task. Using a common lock avoids ABBA deadlock + // when multiple listeners created under the same TMM. + // See: https://github.com/apache/incubator-gluten/issues/6622 + private final Object sharedLock; - public ManagedReservationListener(MemoryTarget target, SimpleMemoryUsageRecorder sharedUsage) { + public ManagedReservationListener( + MemoryTarget target, SimpleMemoryUsageRecorder sharedUsage, Object sharedLock) { this.target = target; this.sharedUsage = sharedUsage; + this.sharedLock = sharedLock; } @Override public long reserve(long size) { - synchronized (this) { + synchronized (sharedLock) { try { long granted = target.borrow(size); sharedUsage.inc(granted); @@ -52,11 +58,10 @@ public long reserve(long size) { @Override public long unreserve(long size) { - synchronized (this) { + synchronized (sharedLock) { try { long freed = target.repay(size); sharedUsage.inc(-freed); - Preconditions.checkState(freed == size); return freed; } catch (Exception e) { LOG.error("Error unreserving memory from target", e); diff --git a/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java index 47b9937eb7a3a..db5ac8426df0f 100644 --- a/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java +++ b/gluten-data/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java @@ -29,7 +29,8 @@ public final class ReservationListeners { public static final ReservationListener NOOP = - new ManagedReservationListener(new NoopMemoryTarget(), new SimpleMemoryUsageRecorder()); + new ManagedReservationListener( + new NoopMemoryTarget(), new SimpleMemoryUsageRecorder(), new Object()); public static ReservationListener create( String name, Spiller spiller, Map mutableStats) { @@ -46,32 +47,31 @@ private static ReservationListener create0( final double overAcquiredRatio = GlutenConfig.getConf().memoryOverAcquiredRatio(); final long reservationBlockSize = GlutenConfig.getConf().memoryReservationBlockSize(); final TaskMemoryManager tmm = TaskResources.getLocalTaskContext().taskMemoryManager(); + final TreeMemoryTarget consumer = + MemoryTargets.newConsumer( + tmm, name, Spillers.withMinSpillSize(spiller, reservationBlockSize), mutableStats); + final MemoryTarget overConsumer = + MemoryTargets.newConsumer( + tmm, + consumer.name() + ".OverAcquire", + new Spiller() { + @Override + public long spill(MemoryTarget self, Phase phase, long size) { + if (!Spillers.PHASE_SET_ALL.contains(phase)) { + return 0L; + } + return self.repay(size); + } + }, + Collections.emptyMap()); final MemoryTarget target = MemoryTargets.throwOnOom( MemoryTargets.overAcquire( - MemoryTargets.dynamicOffHeapSizingIfEnabled( - MemoryTargets.newConsumer( - tmm, - name, - Spillers.withMinSpillSize(spiller, reservationBlockSize), - mutableStats)), - MemoryTargets.dynamicOffHeapSizingIfEnabled( - MemoryTargets.newConsumer( - tmm, - "OverAcquire.DummyTarget", - new Spiller() { - @Override - public long spill(MemoryTarget self, Spiller.Phase phase, long size) { - if (!Spillers.PHASE_SET_ALL.contains(phase)) { - return 0L; - } - return self.repay(size); - } - }, - Collections.emptyMap())), + 
MemoryTargets.dynamicOffHeapSizingIfEnabled(consumer), + MemoryTargets.dynamicOffHeapSizingIfEnabled(overConsumer), overAcquiredRatio)); // Listener. - return new ManagedReservationListener(target, TaskResources.getSharedUsage()); + return new ManagedReservationListener(target, TaskResources.getSharedUsage(), tmm); } } diff --git a/gluten-data/src/main/java/org/apache/gluten/exec/RuntimeAware.java b/gluten-data/src/main/java/org/apache/gluten/runtime/RuntimeAware.java similarity index 96% rename from gluten-data/src/main/java/org/apache/gluten/exec/RuntimeAware.java rename to gluten-data/src/main/java/org/apache/gluten/runtime/RuntimeAware.java index ca96ace644927..5caef9a691873 100644 --- a/gluten-data/src/main/java/org/apache/gluten/exec/RuntimeAware.java +++ b/gluten-data/src/main/java/org/apache/gluten/runtime/RuntimeAware.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.gluten.exec; +package org.apache.gluten.runtime; /** * This defines the base abstraction for the contextual objects that can be transmitted to C++ side diff --git a/gluten-data/src/main/java/org/apache/gluten/exec/RuntimeJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/runtime/RuntimeJniWrapper.java similarity index 97% rename from gluten-data/src/main/java/org/apache/gluten/exec/RuntimeJniWrapper.java rename to gluten-data/src/main/java/org/apache/gluten/runtime/RuntimeJniWrapper.java index d2a18e9b4930c..80f9509d9eaaa 100644 --- a/gluten-data/src/main/java/org/apache/gluten/exec/RuntimeJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/runtime/RuntimeJniWrapper.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.exec; +package org.apache.gluten.runtime; import org.apache.gluten.memory.listener.ReservationListener; diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchInIterator.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchInIterator.java index c147862d01395..ef7d7167c80d8 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchInIterator.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchInIterator.java @@ -18,8 +18,8 @@ import org.apache.gluten.columnarbatch.ColumnarBatchJniWrapper; import org.apache.gluten.columnarbatch.ColumnarBatches; -import org.apache.gluten.exec.Runtimes; import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators; +import org.apache.gluten.runtime.Runtimes; import org.apache.spark.sql.vectorized.ColumnarBatch; diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchOutIterator.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchOutIterator.java index 9dd0404384ad4..4f436e58d071b 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchOutIterator.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchOutIterator.java @@ -17,9 +17,9 @@ package org.apache.gluten.vectorized; import org.apache.gluten.columnarbatch.ColumnarBatches; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; import org.apache.gluten.metrics.IMetrics; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; import org.apache.spark.sql.vectorized.ColumnarBatch; diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchSerializerJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchSerializerJniWrapper.java index bfe0d756112f4..d78cbdab2403d 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchSerializerJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ColumnarBatchSerializerJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.vectorized; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; public class ColumnarBatchSerializerJniWrapper implements RuntimeAware { private final Runtime runtime; diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/GlutenSplitResult.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/GlutenSplitResult.java index dbc0d7db51913..3bed6ac794fe5 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/GlutenSplitResult.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/GlutenSplitResult.java @@ -17,6 +17,7 @@ package org.apache.gluten.vectorized; public class GlutenSplitResult extends SplitResult { + private final long bytesToEvict; private final long peakBytes; private final long sortTime; private final long c2rTime; @@ -30,6 +31,7 @@ public GlutenSplitResult( long totalC2RTime, long totalBytesWritten, long totalBytesEvicted, + long totalBytesToEvict, // In-memory bytes(uncompressed) before spill. 
long peakBytes, long[] partitionLengths, long[] rawPartitionLengths) { @@ -42,11 +44,16 @@ public GlutenSplitResult( totalBytesEvicted, partitionLengths, rawPartitionLengths); + this.bytesToEvict = totalBytesToEvict; this.peakBytes = peakBytes; this.sortTime = totalSortTime; this.c2rTime = totalC2RTime; } + public long getBytesToEvict() { + return bytesToEvict; + } + public long getPeakBytes() { return peakBytes; } diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeColumnarToRowJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeColumnarToRowJniWrapper.java index ffcb77ad32c89..947f4c8166ce9 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeColumnarToRowJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeColumnarToRowJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.vectorized; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; public class NativeColumnarToRowJniWrapper implements RuntimeAware { private final Runtime runtime; diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativePlanEvaluator.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativePlanEvaluator.java index e5eea029b2b3d..8fb18e439a601 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativePlanEvaluator.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativePlanEvaluator.java @@ -17,11 +17,11 @@ package org.apache.gluten.vectorized; import org.apache.gluten.backendsapi.BackendsApiManager; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.Runtimes; import org.apache.gluten.memory.memtarget.MemoryTarget; import org.apache.gluten.memory.memtarget.Spiller; import org.apache.gluten.memory.memtarget.Spillers; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.Runtimes; import org.apache.gluten.utils.DebugUtil; import org.apache.gluten.validate.NativePlanValidationInfo; diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeRowToColumnarJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeRowToColumnarJniWrapper.java index 1185d52fee773..c561174b2ba2e 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeRowToColumnarJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/NativeRowToColumnarJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.vectorized; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; public class NativeRowToColumnarJniWrapper implements RuntimeAware { private final Runtime runtime; diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/PlanEvaluatorJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/PlanEvaluatorJniWrapper.java index eecd7c9e28790..ee6a26a41a36e 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/PlanEvaluatorJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/PlanEvaluatorJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.vectorized; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; import 
org.apache.gluten.validate.NativePlanValidationInfo; /** diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleReaderJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleReaderJniWrapper.java index 515486e45a5bc..3d2f9e119732f 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleReaderJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleReaderJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.vectorized; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; public class ShuffleReaderJniWrapper implements RuntimeAware { private final Runtime runtime; diff --git a/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java b/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java index 1d622d491eb51..23c7118afc6e6 100644 --- a/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java +++ b/gluten-data/src/main/java/org/apache/gluten/vectorized/ShuffleWriterJniWrapper.java @@ -16,8 +16,8 @@ */ package org.apache.gluten.vectorized; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.RuntimeAware; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.RuntimeAware; import java.io.IOException; diff --git a/gluten-data/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatchUtil.java b/gluten-data/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatchUtil.java new file mode 100644 index 0000000000000..0e2c748130387 --- /dev/null +++ b/gluten-data/src/main/java/org/apache/spark/sql/vectorized/ColumnarBatchUtil.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.vectorized; + +import org.apache.gluten.columnarbatch.ColumnarBatches; +import org.apache.gluten.exception.GlutenException; + +import java.lang.reflect.Field; + +public class ColumnarBatchUtil { + + private static final Field FIELD_COLUMNS; + private static final Field FIELD_COLUMNAR_BATCH_ROW; + + static { + try { + Field f = ColumnarBatch.class.getDeclaredField("columns"); + f.setAccessible(true); + FIELD_COLUMNS = f; + Field row = ColumnarBatch.class.getDeclaredField("row"); + row.setAccessible(true); + FIELD_COLUMNAR_BATCH_ROW = row; + } catch (NoSuchFieldException e) { + throw new GlutenException(e); + } + } + + private static void setColumnarBatchRow( + ColumnarBatch from, ColumnVector[] columns, ColumnarBatch target) { + ColumnarBatchRow newRow = new ColumnarBatchRow(columns); + try { + ColumnarBatchRow row = (ColumnarBatchRow) FIELD_COLUMNAR_BATCH_ROW.get(from); + newRow.rowId = row.rowId; + FIELD_COLUMNAR_BATCH_ROW.set(target, newRow); + } catch (IllegalAccessException e) { + throw new GlutenException(e); + } + } + + public static void transferVectors(ColumnarBatch from, ColumnarBatch target) { + try { + if (target.numCols() != from.numCols()) { + throw new IllegalStateException(); + } + final ColumnVector[] newVectors = new ColumnVector[from.numCols()]; + for (int i = 0; i < target.numCols(); i++) { + newVectors[i] = from.column(i); + } + FIELD_COLUMNS.set(target, newVectors); + // Light batch does not need the row. + if (ColumnarBatches.isHeavyBatch(target)) { + setColumnarBatchRow(from, newVectors, target); + } + } catch (IllegalAccessException e) { + throw new GlutenException(e); + } + } +} diff --git a/gluten-data/src/main/scala/org/apache/gluten/exec/Runtime.scala b/gluten-data/src/main/scala/org/apache/gluten/runtime/Runtime.scala similarity index 87% rename from gluten-data/src/main/scala/org/apache/gluten/exec/Runtime.scala rename to gluten-data/src/main/scala/org/apache/gluten/runtime/Runtime.scala index 1f632659eadf8..7ba72b379ba34 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/exec/Runtime.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/runtime/Runtime.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.exec +package org.apache.gluten.runtime import org.apache.gluten.GlutenConfig import org.apache.gluten.backendsapi.BackendsApiManager @@ -44,7 +44,7 @@ trait Runtime { } object Runtime { - private[exec] def apply(name: String): Runtime with TaskResource = { + private[runtime] def apply(name: String): Runtime with TaskResource = { new RuntimeImpl(name) } @@ -97,16 +97,17 @@ object Runtime { throw new GlutenException( s"Runtime instance already released: $handle, ${resourceName()}, ${priority()}") } + + def dump(): KnownNameAndStats = { + new KnownNameAndStats() { + override def name: String = resourceName() + override def stats: MemoryUsageStats = collectMemoryUsage() + } + } + if (LOGGER.isDebugEnabled) { LOGGER.debug( - SparkMemoryUtil.prettyPrintStats( - "About to release memory manager, usage dump:", - new KnownNameAndStats() { - override def name: String = resourceName() - - override def stats: MemoryUsageStats = collectMemoryUsage() - } - )) + SparkMemoryUtil.prettyPrintStats("About to release memory manager, usage dump:", dump())) } RuntimeJniWrapper.releaseRuntime(handle) @@ -115,10 +116,11 @@ object Runtime { LOGGER.warn( String.format( "%s Reservation listener %s still reserved non-zero bytes, which may cause memory" + - " leak, size: %s. ", + " leak, size: %s, dump: %s ", name, rl.toString, - SparkMemoryUtil.bytesToString(rl.getUsedBytes) + SparkMemoryUtil.bytesToString(rl.getUsedBytes), + dump() )) } } diff --git a/gluten-data/src/main/scala/org/apache/gluten/exec/Runtimes.scala b/gluten-data/src/main/scala/org/apache/gluten/runtime/Runtimes.scala similarity index 97% rename from gluten-data/src/main/scala/org/apache/gluten/exec/Runtimes.scala rename to gluten-data/src/main/scala/org/apache/gluten/runtime/Runtimes.scala index 3614fe05f6b60..6d5e11afeff54 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/exec/Runtimes.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/runtime/Runtimes.scala @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.exec +package org.apache.gluten.runtime import org.apache.spark.util.{TaskResource, TaskResources} diff --git a/gluten-data/src/main/scala/org/apache/gluten/utils/ArrowAbiUtil.scala b/gluten-data/src/main/scala/org/apache/gluten/utils/ArrowAbiUtil.scala index 442ae74bac987..8c6161e0c44c0 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/utils/ArrowAbiUtil.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/utils/ArrowAbiUtil.scala @@ -119,7 +119,7 @@ object ArrowAbiUtil { } } - def exportField(allocator: BufferAllocator, field: Field, out: ArrowSchema) { + def exportField(allocator: BufferAllocator, field: Field, out: ArrowSchema): Unit = { val dictProvider = new CDataDictionaryProvider try { Data.exportField(allocator, field, dictProvider, out) @@ -128,7 +128,7 @@ object ArrowAbiUtil { } } - def exportSchema(allocator: BufferAllocator, schema: Schema, out: ArrowSchema) { + def exportSchema(allocator: BufferAllocator, schema: Schema, out: ArrowSchema): Unit = { val dictProvider = new CDataDictionaryProvider try { Data.exportSchema(allocator, schema, dictProvider, out) diff --git a/gluten-data/src/main/scala/org/apache/gluten/vectorized/ColumnarBatchSerializer.scala b/gluten-data/src/main/scala/org/apache/gluten/vectorized/ColumnarBatchSerializer.scala index 379fb4885fb3a..0bd78cb92c5cc 100644 --- a/gluten-data/src/main/scala/org/apache/gluten/vectorized/ColumnarBatchSerializer.scala +++ b/gluten-data/src/main/scala/org/apache/gluten/vectorized/ColumnarBatchSerializer.scala @@ -17,8 +17,8 @@ package org.apache.gluten.vectorized import org.apache.gluten.GlutenConfig -import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.utils.ArrowAbiUtil import org.apache.spark.SparkEnv diff --git a/gluten-data/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala b/gluten-data/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala index d62ff1d68d6d6..251bb977f3248 100644 --- a/gluten-data/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala +++ b/gluten-data/src/main/scala/org/apache/spark/shuffle/ColumnarShuffleWriter.scala @@ -18,8 +18,8 @@ package org.apache.spark.shuffle import org.apache.gluten.GlutenConfig import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller, Spillers} +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.vectorized._ import org.apache.spark._ @@ -210,6 +210,8 @@ class ColumnarShuffleWriter[K, V]( dep.metrics("peakBytes").add(splitResult.getPeakBytes) writeMetrics.incBytesWritten(splitResult.getTotalBytesWritten) writeMetrics.incWriteTime(splitResult.getTotalWriteTime + splitResult.getTotalSpillTime) + taskContext.taskMetrics().incMemoryBytesSpilled(splitResult.getBytesToEvict) + taskContext.taskMetrics().incDiskBytesSpilled(splitResult.getTotalBytesSpilled) partitionLengths = splitResult.getPartitionLengths try { diff --git a/gluten-data/src/main/scala/org/apache/spark/sql/execution/ColumnarBuildSideRelation.scala b/gluten-data/src/main/scala/org/apache/spark/sql/execution/ColumnarBuildSideRelation.scala index 9f13ea967a8d8..cb65dbca4db09 100644 --- a/gluten-data/src/main/scala/org/apache/spark/sql/execution/ColumnarBuildSideRelation.scala +++ b/gluten-data/src/main/scala/org/apache/spark/sql/execution/ColumnarBuildSideRelation.scala @@ -17,8 +17,8 @@ package 
org.apache.spark.sql.execution import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.sql.shims.SparkShimLoader import org.apache.gluten.utils.ArrowAbiUtil import org.apache.gluten.utils.iterator.Iterators diff --git a/gluten-data/src/main/scala/org/apache/spark/sql/execution/utils/ExecUtil.scala b/gluten-data/src/main/scala/org/apache/spark/sql/execution/utils/ExecUtil.scala index 94bdc73a5b502..65b06214e73be 100644 --- a/gluten-data/src/main/scala/org/apache/spark/sql/execution/utils/ExecUtil.scala +++ b/gluten-data/src/main/scala/org/apache/spark/sql/execution/utils/ExecUtil.scala @@ -17,8 +17,8 @@ package org.apache.spark.sql.execution.utils import org.apache.gluten.columnarbatch.ColumnarBatches -import org.apache.gluten.exec.Runtimes import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators +import org.apache.gluten.runtime.Runtimes import org.apache.gluten.utils.iterator.Iterators import org.apache.gluten.vectorized.{ArrowWritableColumnVector, NativeColumnarToRowInfo, NativeColumnarToRowJniWrapper, NativePartitioning} diff --git a/gluten-data/src/main/scala/org/apache/spark/sql/utils/SparkArrowUtil.scala b/gluten-data/src/main/scala/org/apache/spark/sql/utils/SparkArrowUtil.scala index 014956d84e9c4..ec6ac35af3e76 100644 --- a/gluten-data/src/main/scala/org/apache/spark/sql/utils/SparkArrowUtil.scala +++ b/gluten-data/src/main/scala/org/apache/spark/sql/utils/SparkArrowUtil.scala @@ -134,7 +134,7 @@ object SparkArrowUtil { val dt = fromArrowField(child) StructField(child.getName, dt, child.isNullable) } - StructType(fields) + StructType(fields.toSeq) case arrowType => fromArrowType(arrowType) } } @@ -147,7 +147,7 @@ object SparkArrowUtil { } def fromArrowSchema(schema: Schema): StructType = { - StructType(schema.getFields.asScala.map { + StructType(schema.getFields.asScala.toSeq.map { field => val dt = fromArrowField(field) StructField(field.getName, dt, field.isNullable) diff --git a/gluten-data/src/test/scala/org/apache/gluten/execution/MassiveMemoryAllocationSuite.scala b/gluten-data/src/test/scala/org/apache/gluten/execution/MassiveMemoryAllocationSuite.scala new file mode 100644 index 0000000000000..ebfa0e6123fdd --- /dev/null +++ b/gluten-data/src/test/scala/org/apache/gluten/execution/MassiveMemoryAllocationSuite.scala @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.execution + +import org.apache.gluten.memory.MemoryUsageStatsBuilder +import org.apache.gluten.memory.listener.{ReservationListener, ReservationListeners} +import org.apache.gluten.memory.memtarget.{MemoryTarget, Spiller, Spillers} + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.util.TaskResources + +import java.util.concurrent.{Callable, Executors, TimeUnit} +import java.util.concurrent.atomic.AtomicLong + +import scala.collection.JavaConverters._ +import scala.util.Random + +class MassiveMemoryAllocationSuite extends SparkFunSuite with SharedSparkSession { + test("concurrent allocation with spill - shared listener") { + val numThreads = 50 + val offHeapSize = 500 + val minExtraSpillSize = 2 + val maxExtraSpillSize = 5 + val numAllocations = 100 + val minAllocationSize = 40 + val maxAllocationSize = 100 + val minAllocationDelayMs = 0 + val maxAllocationDelayMs = 0 + withSQLConf("spark.memory.offHeap.size" -> s"$offHeapSize") { + val total = new AtomicLong(0L) + TaskResources.runUnsafe { + val spiller = Spillers.appendable() + val listener = ReservationListeners.create( + s"listener", + spiller, + Map[String, MemoryUsageStatsBuilder]().asJava) + spiller.append(new Spiller() { + override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = { + val extraSpillSize = randomInt(minExtraSpillSize, maxExtraSpillSize) + val spillSize = size + extraSpillSize + val released = listener.unreserve(spillSize) + assert(released <= spillSize) + total.getAndAdd(-released) + spillSize + } + }) + val pool = Executors.newFixedThreadPool(numThreads) + val tasks = (0 until numThreads).map { + _ => + new Callable[Unit]() { + override def call(): Unit = { + (0 until numAllocations).foreach { + _ => + val allocSize = + randomInt(minAllocationSize, maxAllocationSize) + val granted = listener.reserve(allocSize) + assert(granted == allocSize) + total.getAndAdd(granted) + val sleepMs = + randomInt(minAllocationDelayMs, maxAllocationDelayMs) + Thread.sleep(sleepMs) + } + } + } + }.toList + val futures = pool.invokeAll(tasks.asJava) + pool.shutdown() + pool.awaitTermination(60, TimeUnit.SECONDS) + futures.forEach(_.get()) + val totalBytes = total.get() + val released = listener.unreserve(totalBytes) + assert(released == totalBytes) + assert(listener.getUsedBytes == 0) + } + } + } + + test("concurrent allocation with spill - dedicated listeners") { + val numThreads = 50 + val offHeapSize = 500 + val minExtraSpillSize = 2 + val maxExtraSpillSize = 5 + val numAllocations = 100 + val minAllocationSize = 40 + val maxAllocationSize = 100 + val minAllocationDelayMs = 0 + val maxAllocationDelayMs = 0 + withSQLConf("spark.memory.offHeap.size" -> s"$offHeapSize") { + TaskResources.runUnsafe { + val total = new AtomicLong(0L) + + def newListener(id: Int): ReservationListener = { + val spiller = Spillers.appendable() + val listener = ReservationListeners.create( + s"listener $id", + spiller, + Map[String, MemoryUsageStatsBuilder]().asJava) + spiller.append(new Spiller() { + override def spill(self: MemoryTarget, phase: Spiller.Phase, size: Long): Long = { + val extraSpillSize = randomInt(minExtraSpillSize, maxExtraSpillSize) + val spillSize = size + extraSpillSize + val released = listener.unreserve(spillSize) + assert(released <= spillSize) + total.getAndAdd(-released) + spillSize + } + }) + listener + } + + val listeners = (0 until numThreads).map(newListener).toList + val pool = 
Executors.newFixedThreadPool(numThreads) + val tasks = (0 until numThreads).map { + i => + new Callable[Unit]() { + override def call(): Unit = { + val listener = listeners(i) + (0 until numAllocations).foreach { + _ => + val allocSize = + randomInt(minAllocationSize, maxAllocationSize) + val granted = listener.reserve(allocSize) + assert(granted == allocSize) + total.getAndAdd(granted) + val sleepMs = + randomInt(minAllocationDelayMs, maxAllocationDelayMs) + Thread.sleep(sleepMs) + } + } + } + }.toList + val futures = pool.invokeAll(tasks.asJava) + pool.shutdown() + pool.awaitTermination(60, TimeUnit.SECONDS) + futures.forEach(_.get()) + val totalBytes = total.get() + val remaining = listeners.foldLeft(totalBytes) { + case (remainingBytes, listener) => + assert(remainingBytes >= 0) + val unreserved = listener.unreserve(remainingBytes) + remainingBytes - unreserved + } + assert(remaining == 0) + assert(listeners.map(_.getUsedBytes).sum == 0) + } + } + } + + private def randomInt(from: Int, to: Int): Int = { + from + Random.nextInt(to - from + 1) + } +} diff --git a/gluten-delta/pom.xml b/gluten-delta/pom.xml index 253d5a8e4f4ff..08e2060d0f633 100755 --- a/gluten-delta/pom.xml +++ b/gluten-delta/pom.xml @@ -62,10 +62,6 @@ test-jar test - - org.apache.spark - spark-core_${scala.binary.version} - org.apache.spark spark-core_${scala.binary.version} diff --git a/gluten-delta/src/main/scala/org/apache/gluten/extension/DeltaRewriteTransformerRules.scala b/gluten-delta/src/main/scala/org/apache/gluten/extension/DeltaRewriteTransformerRules.scala index 76eb53dbd0227..fed837d308be6 100644 --- a/gluten-delta/src/main/scala/org/apache/gluten/extension/DeltaRewriteTransformerRules.scala +++ b/gluten-delta/src/main/scala/org/apache/gluten/extension/DeltaRewriteTransformerRules.scala @@ -28,7 +28,7 @@ import org.apache.spark.sql.delta.{DeltaColumnMapping, DeltaParquetFileFormat, N import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.datasources.FileFormat -import scala.collection._ +import scala.collection.mutable.ListBuffer class DeltaRewriteTransformerRules extends RewriteTransformerRules { override def rules: Seq[Rule[SparkPlan]] = columnMappingRule :: Nil @@ -87,8 +87,8 @@ object DeltaRewriteTransformerRules { )(SparkSession.active) // transform output's name into physical name so Reader can read data correctly // should keep the columns order the same as the origin output - val originColumnNames = mutable.ListBuffer.empty[String] - val transformedAttrs = mutable.ListBuffer.empty[Attribute] + val originColumnNames = ListBuffer.empty[String] + val transformedAttrs = ListBuffer.empty[Attribute] def mapAttribute(attr: Attribute) = { val newAttr = if (!plan.isMetadataColumn(attr)) { DeltaColumnMapping @@ -142,7 +142,7 @@ object DeltaRewriteTransformerRules { val expr = (transformedAttrs, originColumnNames).zipped.map { (attr, columnName) => Alias(attr, columnName)(exprId = attr.exprId) } - val projectExecTransformer = ProjectExecTransformer(expr, scanExecTransformer) + val projectExecTransformer = ProjectExecTransformer(expr.toSeq, scanExecTransformer) projectExecTransformer case _ => plan } diff --git a/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergScanTransformer.scala b/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergScanTransformer.scala index 5a735b802adbe..9fb8521d9df5b 100644 --- a/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergScanTransformer.scala +++ 
b/gluten-iceberg/src/main/scala/org/apache/gluten/execution/IcebergScanTransformer.scala @@ -54,6 +54,9 @@ case class IcebergScanTransformer( override def getInputFilePathsInternal: Seq[String] = Seq.empty + // TODO: get root paths from table. + override def getRootPathsInternal: Seq[String] = Seq.empty + override lazy val fileFormat: ReadFileFormat = GlutenIcebergSourceUtil.getFileFormat(scan) override def getSplitInfosFromPartitions(partitions: Seq[InputPartition]): Seq[SplitInfo] = { diff --git a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/path/PathFinderSuite.scala b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/path/PathFinderSuite.scala index b5ea3fc3cf6ec..4b3a675cd8433 100644 --- a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/path/PathFinderSuite.scala +++ b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/path/PathFinderSuite.scala @@ -262,18 +262,18 @@ class PathFinderSuite extends AnyFunSuite { assert(path.plan() == Binary(n1, Group(1), Group(2))) assert( - path.dive(state, 1).map(_.plan()) == List( + path.dive(state, 1).map(_.plan()).toList == List( Binary(n1, Unary(n2, Group(3)), Unary(n3, Group(4))))) assert( - path.dive(state, 2).map(_.plan()) == List( + path.dive(state, 2).map(_.plan()).toList == List( Binary(n1, Unary(n2, Leaf(n4, 1)), Unary(n3, Leaf(n5, 1))), Binary(n1, Unary(n2, Leaf(n4, 1)), Unary(n3, Leaf(n6, 1))))) assert( - path.dive(state, 3).map(_.plan()) == List( + path.dive(state, 3).map(_.plan()).toList == List( Binary(n1, Unary(n2, Leaf(n4, 1)), Unary(n3, Leaf(n5, 1))), Binary(n1, Unary(n2, Leaf(n4, 1)), Unary(n3, Leaf(n6, 1))))) assert( - path.dive(state, RasPath.INF_DEPTH).map(_.plan()) == List( + path.dive(state, RasPath.INF_DEPTH).map(_.plan()).toList == List( Binary(n1, Unary(n2, Leaf(n4, 1)), Unary(n3, Leaf(n5, 1))), Binary(n1, Unary(n2, Leaf(n4, 1)), Unary(n3, Leaf(n6, 1))))) } @@ -338,13 +338,13 @@ class PathFinderSuite extends AnyFunSuite { path.dive(state, 1).map(_.plan()).toSeq == List( Binary(n1, Binary(n2, Group(3), Group(4)), Leaf(n3, 1)))) assert( - path.dive(state, 2).map(_.plan()) == List( + path.dive(state, 2).map(_.plan()).toList == List( Binary(n1, Binary(n2, Leaf(n4, 1), Leaf(n5, 1)), Leaf(n3, 1)))) assert( - path.dive(state, 3).map(_.plan()) == List( + path.dive(state, 3).map(_.plan()).toList == List( Binary(n1, Binary(n2, Leaf(n4, 1), Leaf(n5, 1)), Leaf(n3, 1)))) assert( - path.dive(state, RasPath.INF_DEPTH).map(_.plan()) == List( + path.dive(state, RasPath.INF_DEPTH).map(_.plan()).toList == List( Binary(n1, Binary(n2, Leaf(n4, 1), Leaf(n5, 1)), Leaf(n3, 1)))) } } diff --git a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/specific/CyclicSearchSpaceSuite.scala b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/specific/CyclicSearchSpaceSuite.scala index d27292fb5361e..077921b697bfb 100644 --- a/gluten-ras/common/src/test/scala/org/apache/gluten/ras/specific/CyclicSearchSpaceSuite.scala +++ b/gluten-ras/common/src/test/scala/org/apache/gluten/ras/specific/CyclicSearchSpaceSuite.scala @@ -65,11 +65,12 @@ abstract class CyclicSearchSpaceSuite extends AnyFunSuite { PathFinder.builder(ras, mockState).depth(depth).build().find(can) } - assert(find(node1, 1).map(p => p.plan()) == List(Unary("node1", Group(0)))) - assert(find(node1, 2).map(p => p.plan()) == List(Unary("node1", Leaf("node2", 1)))) - assert(find(node1, 3).map(p => p.plan()) == List(Unary("node1", Leaf("node2", 1)))) + assert(find(node1, 1).map(p => p.plan()).toList == List(Unary("node1", Group(0)))) + assert(find(node1, 2).map(p => 
p.plan()).toList == List(Unary("node1", Leaf("node2", 1)))) + assert(find(node1, 3).map(p => p.plan()).toList == List(Unary("node1", Leaf("node2", 1)))) assert( - find(node1, RasPath.INF_DEPTH).map(p => p.plan()) == List(Unary("node1", Leaf("node2", 1)))) + find(node1, RasPath.INF_DEPTH).map(p => p.plan()).toList == List( + Unary("node1", Leaf("node2", 1)))) } test("Cyclic - find best, simple self cycle") { diff --git a/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java b/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java index d2032fa48564a..2219fc674431e 100644 --- a/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java +++ b/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java @@ -18,11 +18,11 @@ import org.apache.gluten.GlutenConfig; import org.apache.gluten.columnarbatch.ColumnarBatches; -import org.apache.gluten.exec.Runtime; -import org.apache.gluten.exec.Runtimes; import org.apache.gluten.memory.memtarget.MemoryTarget; import org.apache.gluten.memory.memtarget.Spiller; import org.apache.gluten.memory.memtarget.Spillers; +import org.apache.gluten.runtime.Runtime; +import org.apache.gluten.runtime.Runtimes; import org.apache.gluten.vectorized.ShuffleWriterJniWrapper; import org.apache.gluten.vectorized.SplitResult; @@ -66,7 +66,7 @@ public class VeloxUniffleColumnarShuffleWriter extends RssShuffleWriter null)); } - compressionLevel = GlutenShuffleUtils.getCompressionLevel(sparkConf, compressionCodec, null); } @Override diff --git a/gluten-ut/common/pom.xml b/gluten-ut/common/pom.xml index 145c66ffa026b..14cb212b648d1 100644 --- a/gluten-ut/common/pom.xml +++ b/gluten-ut/common/pom.xml @@ -40,10 +40,6 @@ org.apache.maven.plugins maven-compiler-plugin - - 1.8 - 1.8 - compile diff --git a/gluten-ut/common/src/test/scala/org/apache/gluten/utils/BackendTestSettings.scala b/gluten-ut/common/src/test/scala/org/apache/gluten/utils/BackendTestSettings.scala index 987635d067be6..dce8ac83710cb 100644 --- a/gluten-ut/common/src/test/scala/org/apache/gluten/utils/BackendTestSettings.scala +++ b/gluten-ut/common/src/test/scala/org/apache/gluten/utils/BackendTestSettings.scala @@ -30,7 +30,10 @@ abstract class BackendTestSettings { private val enabledSuites: java.util.Map[String, SuiteSettings] = new util.HashMap() protected def enableSuite[T: ClassTag]: SuiteSettings = { - val suiteName = implicitly[ClassTag[T]].runtimeClass.getCanonicalName + enableSuite(implicitly[ClassTag[T]].runtimeClass.getCanonicalName) + } + + protected def enableSuite(suiteName: String): SuiteSettings = { if (enabledSuites.containsKey(suiteName)) { throw new IllegalArgumentException("Duplicated suite name: " + suiteName) } diff --git a/gluten-ut/pom.xml b/gluten-ut/pom.xml index 90644b832bf82..a016eccaed201 100644 --- a/gluten-ut/pom.xml +++ b/gluten-ut/pom.xml @@ -31,7 +31,7 @@ gluten-ut pom - Gluten Unit Test + Gluten Unit Test Parent org.slf4j diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index ab288e835b127..77c12621efebc 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -775,6 +775,7 
@@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("REPEAT") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") + .exclude("FORMAT") // refer https://github.com/apache/incubator-gluten/issues/6765 enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSuiteV1Filter] diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala index 437cef29215ce..dbc70bf74598d 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala @@ -34,7 +34,7 @@ class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTe strDf.select(raise_error($"a")).collect() } assert(e2.getCause.isInstanceOf[RuntimeException]) - assert(e2.getCause.getMessage contains "hello") + assert(e2.getCause.getMessage.contains("hello")) } testGluten("assert_true") { @@ -58,7 +58,7 @@ class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTe nullDf.select(assert_true($"cond", $"n")).collect() } assert(e2.getCause.isInstanceOf[RuntimeException]) - assert(e2.getCause.getMessage contains "first row") + assert(e2.getCause.getMessage.contains("first row")) // assert_true(condition) val intDf = Seq((0, 1)).toDF("a", "b") @@ -67,7 +67,7 @@ class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTe intDf.select(assert_true($"a" > $"b")).collect() } assert(e3.getCause.isInstanceOf[RuntimeException]) - assert(e3.getCause.getMessage contains "'('a > 'b)' is not true!") + assert(e3.getCause.getMessage.contains("'('a > 'b)' is not true!")) } testGluten( diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala index 8a291990ea31f..65578eb52bcad 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala @@ -74,20 +74,19 @@ import scala.util.control.NonFatal * The format for input files is simple: * 1. A list of SQL queries separated by semicolons by default. If the semicolon cannot * effectively separate the SQL queries in the test file(e.g. bracketed comments), please use - * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END. Lines starting with - * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END represent the beginning and end of a query, - * respectively. Code that is not surrounded by lines that begin with --QUERY-DELIMITER-START and - * --QUERY-DELIMITER-END is still separated by semicolons. 2. Lines starting with -- are treated as - * comments and ignored. 3. Lines starting with --SET are used to specify the configs when running - * this testing file. You can set multiple configs in one --SET, using comma to separate them. Or - * you can use multiple - * --SET statements. 4. Lines starting with --IMPORT are used to load queries from another test - * file. 5. Lines starting with --CONFIG_DIM are used to specify config dimensions of this testing - * file. The dimension name is decided by the string after --CONFIG_DIM. For example, --CONFIG_DIM1 - * belongs to dimension 1. 
One dimension can have multiple lines, each line representing one config - * set (one or more configs, separated by comma). Spark will run this testing file many times, each - * time picks one config set from each dimension, until all the combinations are tried. For example, - * if dimension 1 has 2 lines, dimension 2 has 3 lines, this testing file will be run 6 times + * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END. Lines starting with --QUERY-DELIMITER-START + * and --QUERY-DELIMITER-END represent the beginning and end of a query, respectively. Code that is + * not surrounded by lines that begin with --QUERY-DELIMITER-START and --QUERY-DELIMITER-END is + * still separated by semicolons. 2. Lines starting with -- are treated as comments and ignored. 3. + * Lines starting with --SET are used to specify the configs when running this testing file. You can + * set multiple configs in one --SET, using comma to separate them. Or you can use multiple --SET + * statements. 4. Lines starting with --IMPORT are used to load queries from another test file. 5. + * Lines starting with --CONFIG_DIM are used to specify config dimensions of this testing file. The + * dimension name is decided by the string after --CONFIG_DIM. For example, --CONFIG_DIM1 belongs to + * dimension 1. One dimension can have multiple lines, each line representing one config set (one or + * more configs, separated by comma). Spark will run this testing file many times, each time picks + * one config set from each dimension, until all the combinations are tried. For example, if + * dimension 1 has 2 lines, dimension 2 has 3 lines, this testing file will be run 6 times * (cartesian product). * * For example: diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala index e220924880c7a..96c53306bde04 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala @@ -248,6 +248,8 @@ class GlutenMathExpressionsSuite extends MathExpressionsSuite with GlutenTestsTr checkEvaluation(BRound(-3.5, 0), -4.0) checkEvaluation(BRound(-0.35, 1), -0.4) checkEvaluation(BRound(-35, -1), -40) + // Enable the test after fixing https://github.com/apache/incubator-gluten/issues/6827 + // checkEvaluation(Round(0.5549999999999999, 2), 0.55) checkEvaluation(BRound(BigDecimal("45.00"), -1), BigDecimal(40)) checkEvaluation(checkDataTypeAndCast(RoundFloor(Literal(2.5), Literal(0))), Decimal(2)) checkEvaluation(checkDataTypeAndCast(RoundFloor(Literal(3.5), Literal(0))), Decimal(3)) diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala index 5150a47688519..a4da5c127c5ff 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala @@ -16,10 +16,12 @@ */ package org.apache.spark.sql.execution -import org.apache.gluten.backendsapi.BackendsApiManager +import org.apache.gluten.GlutenConfig import org.apache.gluten.execution.BasicScanExecTransformer import org.apache.gluten.extension.GlutenPlan -import 
org.apache.gluten.extension.columnar.{FallbackEmptySchemaRelation, FallbackTags} +import org.apache.gluten.extension.columnar.{ExpandFallbackPolicy, FallbackTags, RemoveFallbackTagRule} +import org.apache.gluten.extension.columnar.ColumnarRuleApplier.ColumnarRuleBuilder +import org.apache.gluten.extension.columnar.MiscColumnarRules.RemoveTopmostColumnarToRow import org.apache.gluten.extension.columnar.heuristic.HeuristicApplier import org.apache.gluten.extension.columnar.transition.InsertTransitions import org.apache.gluten.utils.QueryPlanSelector @@ -28,20 +30,22 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{GlutenSQLTestsTrait, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.rules.Rule class FallbackStrategiesSuite extends GlutenSQLTestsTrait { - + import FallbackStrategiesSuite._ testGluten("Fall back the whole query if one unsupported") { withSQLConf(("spark.gluten.sql.columnar.query.fallback.threshold", "1")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark).withTransformRules( + val rule = newRuleApplier( + spark, List( _ => _ => { UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + c => InsertTransitions(c.outputsColumnar))) + val outputPlan = rule.apply(originalPlan, false) // Expect to fall back the entire plan. assert(outputPlan == originalPlan) } @@ -50,16 +54,16 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { testGluten("Fall back the whole plan if meeting the configured threshold") { withSQLConf(("spark.gluten.sql.columnar.wholeStage.fallback.threshold", "1")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark) + val rule = newRuleApplier( + spark, + List( + _ => + _ => { + UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) + }, + c => InsertTransitions(c.outputsColumnar))) .enableAdaptiveContext() - .withTransformRules( - List( - _ => - _ => { - UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) - }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + val outputPlan = rule.apply(originalPlan, false) // Expect to fall back the entire plan. assert(outputPlan == originalPlan) } @@ -68,16 +72,16 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { testGluten("Don't fall back the whole plan if NOT meeting the configured threshold") { withSQLConf(("spark.gluten.sql.columnar.wholeStage.fallback.threshold", "4")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark) + val rule = newRuleApplier( + spark, + List( + _ => + _ => { + UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) + }, + c => InsertTransitions(c.outputsColumnar))) .enableAdaptiveContext() - .withTransformRules( - List( - _ => - _ => { - UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) - }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + val outputPlan = rule.apply(originalPlan, false) // Expect to get the plan with columnar rule applied. 
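The refactor in this suite replaces `new HeuristicApplier(spark).withTransformRules(...)` with the `newRuleApplier(spark, transformBuilders)` helper introduced later in this patch, and the applier is now invoked as `apply(plan, false)`, where the second argument states whether the final plan must output columnar batches. The fragment below only condenses that call shape; it reuses the suite's own mock operators and helper and is not standalone code.

```scala
// Fragment reusing this suite's LeafOp / UnaryOp* / UnaryOp1Transformer mocks
// and the newRuleApplier helper added in this patch.
val applier = newRuleApplier(
  spark,
  List(
    // Rewrite supported operators into their *Transformer counterparts.
    _ => _ => UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))),
    // Insert row <-> columnar transitions based on the applier's context.
    c => InsertTransitions(c.outputsColumnar)
  )).enableAdaptiveContext()
// Second argument: whether the resulting plan is required to be columnar.
val outputPlan = applier.apply(originalPlan, false)
```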
assert(outputPlan != originalPlan) } @@ -88,16 +92,16 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { " transformable)") { withSQLConf(("spark.gluten.sql.columnar.wholeStage.fallback.threshold", "2")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark) + val rule = newRuleApplier( + spark, + List( + _ => + _ => { + UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOpTransformer())))) + }, + c => InsertTransitions(c.outputsColumnar))) .enableAdaptiveContext() - .withTransformRules( - List( - _ => - _ => { - UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOpTransformer())))) - }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + val outputPlan = rule.apply(originalPlan, false) // Expect to fall back the entire plan. assert(outputPlan == originalPlan) } @@ -108,16 +112,16 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { "leaf node is transformable)") { withSQLConf(("spark.gluten.sql.columnar.wholeStage.fallback.threshold", "3")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark) + val rule = newRuleApplier( + spark, + List( + _ => + _ => { + UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOpTransformer())))) + }, + c => InsertTransitions(c.outputsColumnar))) .enableAdaptiveContext() - .withTransformRules( - List( - _ => - _ => { - UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOpTransformer())))) - }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + val outputPlan = rule.apply(originalPlan, false) // Expect to get the plan with columnar rule applied. 
assert(outputPlan != originalPlan) } @@ -129,13 +133,9 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { val rule = FallbackEmptySchemaRelation() val newPlan = rule.apply(originalPlan) val reason = FallbackTags.get(newPlan).reason() - if (BackendsApiManager.getSettings.fallbackOnEmptySchema(newPlan)) { - assert( - reason.contains("fake reason") && - reason.contains("at least one of its children has empty output")) - } else { - assert(reason.contains("fake reason")) - } + assert( + reason.contains("fake reason") && + reason.contains("at least one of its children has empty output")) } testGluten("test enabling/disabling Gluten at thread level") { @@ -170,43 +170,79 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { } } -case class LeafOp(override val supportsColumnar: Boolean = false) extends LeafExecNode { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = Seq.empty -} +private object FallbackStrategiesSuite { + def newRuleApplier( + spark: SparkSession, + transformBuilders: Seq[ColumnarRuleBuilder]): HeuristicApplier = { + new HeuristicApplier( + spark, + transformBuilders, + List(c => ExpandFallbackPolicy(c.ac.isAdaptiveContext(), c.ac.originalPlan())), + List( + c => RemoveTopmostColumnarToRow(c.session, c.ac.isAdaptiveContext()), + _ => ColumnarCollapseTransformStages(GlutenConfig.getConf) + ), + List(_ => RemoveFallbackTagRule()) + ) + } -case class UnaryOp1(child: SparkPlan, override val supportsColumnar: Boolean = false) - extends UnaryExecNode { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = child.output - override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp1 = - copy(child = newChild) -} + // TODO: Generalize the code among shim versions. + case class FallbackEmptySchemaRelation() extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = plan.transformDown { + case p => + if (p.children.exists(_.output.isEmpty)) { + // Some backends are not eligible to offload plan with zero-column input. + // If any child have empty output, mark the plan and that child as UNSUPPORTED. + FallbackTags.add(p, "at least one of its children has empty output") + p.children.foreach { + child => + if (child.output.isEmpty) { + FallbackTags.add(child, "at least one of its children has empty output") + } + } + } + p + } + } -case class UnaryOp2(child: SparkPlan, override val supportsColumnar: Boolean = false) - extends UnaryExecNode { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = child.output - override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp2 = - copy(child = newChild) -} + case class LeafOp(override val supportsColumnar: Boolean = false) extends LeafExecNode { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = Seq.empty + } -// For replacing LeafOp. 
-case class LeafOpTransformer(override val supportsColumnar: Boolean = true) - extends LeafExecNode - with GlutenPlan { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = Seq.empty -} + case class UnaryOp1(child: SparkPlan, override val supportsColumnar: Boolean = false) + extends UnaryExecNode { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = child.output + override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp1 = + copy(child = newChild) + } + + case class UnaryOp2(child: SparkPlan, override val supportsColumnar: Boolean = false) + extends UnaryExecNode { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = child.output + override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp2 = + copy(child = newChild) + } + + // For replacing LeafOp. + case class LeafOpTransformer(override val supportsColumnar: Boolean = true) + extends LeafExecNode + with GlutenPlan { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = Seq.empty + } -// For replacing UnaryOp1. -case class UnaryOp1Transformer( - override val child: SparkPlan, - override val supportsColumnar: Boolean = true) - extends UnaryExecNode - with GlutenPlan { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = child.output - override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp1Transformer = - copy(child = newChild) + // For replacing UnaryOp1. 
+ case class UnaryOp1Transformer( + override val child: SparkPlan, + override val supportsColumnar: Boolean = true) + extends UnaryExecNode + with GlutenPlan { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = child.output + override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp1Transformer = + copy(child = newChild) + } } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala index 726ace3a15f16..eb6794bba812c 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala @@ -151,7 +151,8 @@ class GlutenV1WriteCommandSuite ), false, _, - _) => + _ + ) => true case SortExecTransformer( Seq( @@ -168,7 +169,8 @@ class GlutenV1WriteCommandSuite ), false, _, - _) => + _ + ) => true case _ => false }, @@ -233,7 +235,8 @@ class GlutenV1WriteCommandSuite ), false, _, - _) => + _ + ) => true case SortExecTransformer( Seq( @@ -250,7 +253,8 @@ class GlutenV1WriteCommandSuite ), false, _, - _) => + _ + ) => true case _ => false }, diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala index 6816534094f32..2ca7429f16792 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala @@ -31,7 +31,8 @@ class GlutenSessionExtensionSuite extends GlutenSQLTestsTrait { } testGluten("test gluten extensions") { - assert(spark.sessionState.columnarRules.contains(ColumnarOverrideRules(spark))) + assert( + spark.sessionState.columnarRules.map(_.getClass).contains(classOf[ColumnarOverrideRules])) assert(spark.sessionState.planner.strategies.contains(MySparkStrategy(spark))) assert(spark.sessionState.analyzer.extendedResolutionRules.contains(MyRule(spark))) diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/sources/GlutenInsertSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/sources/GlutenInsertSuite.scala index 5c60115c5e1d5..ca0ada39ceec4 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/sources/GlutenInsertSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/sources/GlutenInsertSuite.scala @@ -16,7 +16,7 @@ */ package org.apache.spark.sql.sources -import org.apache.gluten.execution.SortExecTransformer +import org.apache.gluten.execution.{ProjectExecTransformer, SortExecTransformer} import org.apache.gluten.extension.GlutenPlan import org.apache.spark.SparkConf @@ -147,6 +147,36 @@ class GlutenInsertSuite assert(parts == expectedPartitionNames) } + testGluten("offload empty2null when v1writes fallback") { + withSQLConf((SQLConf.MAX_RECORDS_PER_FILE.key, "1000")) { + withTable("pt") { + spark.sql("CREATE TABLE pt (c1 int) USING PARQUET PARTITIONED BY(p string)") + + val df = spark.sql(s""" + |INSERT OVERWRITE TABLE pt PARTITION(p) + |SELECT c1, c2 as p FROM source + |""".stripMargin) + + val writeFiles = stripAQEPlan( + df.queryExecution.executedPlan + .asInstanceOf[CommandResultExec] + 
.commandPhysicalPlan).children.head + assert(!writeFiles.isInstanceOf[ColumnarWriteFilesExec]) + assert(writeFiles.exists(_.isInstanceOf[ProjectExecTransformer])) + val projectExecTransformer = writeFiles + .find(_.isInstanceOf[ProjectExecTransformer]) + .get + .asInstanceOf[ProjectExecTransformer] + projectExecTransformer.projectList.find(_.toString().contains("empty2null")) + + // The partition column should never be empty + checkAnswer( + spark.sql("SELECT * FROM pt"), + spark.sql("SELECT c1, if(c2 = '', null, c2) FROM source")) + } + } + } + testGluten("remove v1writes sort and project") { // Only string type has empty2null expression withTable("pt") { diff --git a/gluten-ut/spark35/pom.xml b/gluten-ut/spark35/pom.xml index 4dcb7de6d0a58..1750a5e278164 100644 --- a/gluten-ut/spark35/pom.xml +++ b/gluten-ut/spark35/pom.xml @@ -36,11 +36,6 @@ test tests - - com.fasterxml.jackson.core - jackson-core - 2.15.1 - @@ -62,6 +57,38 @@ ${celeborn.version} test + + org.apache.arrow + arrow-memory-core + ${arrow.version} + provided + + + io.netty + netty-common + + + io.netty + netty-buffer + + + + + org.apache.arrow + arrow-vector + ${arrow.version} + provided + + + io.netty + netty-common + + + io.netty + netty-buffer + + + @@ -98,44 +125,6 @@ ${project.version} test - - org.apache.arrow - arrow-vector - ${arrow.version} - - - io.netty - netty-common - - - io.netty - netty-buffer - - - test - - - org.apache.arrow - arrow-memory-netty - ${arrow.version} - test - - - org.apache.arrow - arrow-memory-core - ${arrow.version} - test - - - io.netty - netty-common - - - io.netty - netty-buffer - - - org.slf4j diff --git a/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/GlutenColumnarWriteTestSupport.scala b/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/GlutenColumnarWriteTestSupport.scala index 43b83afe9af37..4258cd891a5a6 100644 --- a/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/GlutenColumnarWriteTestSupport.scala +++ b/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/GlutenColumnarWriteTestSupport.scala @@ -16,11 +16,12 @@ */ package org.apache.gluten -import org.apache.spark.sql.execution.SparkPlan +import org.apache.spark.sql.execution.{ColumnarWriteFilesExec, SparkPlan} trait GlutenColumnarWriteTestSupport { def checkWriteFilesAndGetChild(sparkPlan: SparkPlan): SparkPlan = { - throw new UnsupportedOperationException("Clickhouse Backend does not support write files") + assert(sparkPlan.isInstanceOf[ColumnarWriteFilesExec]) + sparkPlan.asInstanceOf[ColumnarWriteFilesExec].child } } diff --git a/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/execution/parquet/GlutenParquetV1FilterSuite2.scala b/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/execution/parquet/GlutenParquetV1FilterSuite2.scala new file mode 100644 index 0000000000000..d20a419597d10 --- /dev/null +++ b/gluten-ut/spark35/src/test/backends-clickhouse/org/apache/gluten/execution/parquet/GlutenParquetV1FilterSuite2.scala @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.execution.parquet + +import org.apache.spark.SparkConf +import org.apache.spark.sql.execution.datasources.parquet.GlutenParquetV1FilterSuite + + +/** testing use_local_format parquet reader. + * FIXME: Run this suite in Spark 35 CI Pipeline + * */ +class GlutenParquetV1FilterSuite2 extends GlutenParquetV1FilterSuite { + override def sparkConf: SparkConf = + super.sparkConf + .set("spark.gluten.sql.columnar.backend.ch.runtime_config.use_local_format", "true") +} diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index ab288e835b127..bf971aba7282a 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -775,6 +775,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("REPEAT") .exclude("ParseUrl") .exclude("SPARK-33468: ParseUrl in ANSI mode should fail if input string is not a valid url") + .exclude("FORMAT") // refer https://github.com/apache/incubator-gluten/issues/6765 enableSuite[GlutenDataSourceV2DataFrameSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSessionCatalogSuite] enableSuite[GlutenDataSourceV2SQLSuiteV1Filter] @@ -1437,6 +1438,8 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("SPARK-34562: Bloom filter push down") .exclude("SPARK-38825: in and notIn filters") .exclude("SPARK-36866: filter pushdown - year-month interval") + .exclude("filter pushdown - StringContains") + .exclude("filter pushdown - StringPredicate") .excludeGlutenTest("SPARK-25207: exception when duplicate fields in case-insensitive mode") enableSuite[GlutenParquetV1PartitionDiscoverySuite] .exclude("SPARK-7847: Dynamic partition directory path escaping and unescaping") diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala index 437cef29215ce..dbc70bf74598d 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenColumnExpressionSuite.scala @@ -34,7 +34,7 @@ class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTe strDf.select(raise_error($"a")).collect() } assert(e2.getCause.isInstanceOf[RuntimeException]) - assert(e2.getCause.getMessage contains "hello") + assert(e2.getCause.getMessage.contains("hello")) } testGluten("assert_true") { @@ -58,7 +58,7 @@ class GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTe nullDf.select(assert_true($"cond", $"n")).collect() } assert(e2.getCause.isInstanceOf[RuntimeException]) - assert(e2.getCause.getMessage contains "first row") + assert(e2.getCause.getMessage.contains("first row")) // assert_true(condition) val intDf = Seq((0, 1)).toDF("a", "b") @@ -67,7 +67,7 @@ class 
GlutenColumnExpressionSuite extends ColumnExpressionSuite with GlutenSQLTe intDf.select(assert_true($"a" > $"b")).collect() } assert(e3.getCause.isInstanceOf[RuntimeException]) - assert(e3.getCause.getMessage contains "'('a > 'b)' is not true!") + assert(e3.getCause.getMessage.contains("'('a > 'b)' is not true!")) } testGluten( diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala index 8a6f5f32f8919..276bc95be586e 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenSQLQueryTestSuite.scala @@ -74,20 +74,19 @@ import scala.util.control.NonFatal * The format for input files is simple: * 1. A list of SQL queries separated by semicolons by default. If the semicolon cannot * effectively separate the SQL queries in the test file(e.g. bracketed comments), please use - * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END. Lines starting with - * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END represent the beginning and end of a query, - * respectively. Code that is not surrounded by lines that begin with --QUERY-DELIMITER-START and - * --QUERY-DELIMITER-END is still separated by semicolons. 2. Lines starting with -- are treated as - * comments and ignored. 3. Lines starting with --SET are used to specify the configs when running - * this testing file. You can set multiple configs in one --SET, using comma to separate them. Or - * you can use multiple - * --SET statements. 4. Lines starting with --IMPORT are used to load queries from another test - * file. 5. Lines starting with --CONFIG_DIM are used to specify config dimensions of this testing - * file. The dimension name is decided by the string after --CONFIG_DIM. For example, --CONFIG_DIM1 - * belongs to dimension 1. One dimension can have multiple lines, each line representing one config - * set (one or more configs, separated by comma). Spark will run this testing file many times, each - * time picks one config set from each dimension, until all the combinations are tried. For example, - * if dimension 1 has 2 lines, dimension 2 has 3 lines, this testing file will be run 6 times + * --QUERY-DELIMITER-START and --QUERY-DELIMITER-END. Lines starting with --QUERY-DELIMITER-START + * and --QUERY-DELIMITER-END represent the beginning and end of a query, respectively. Code that is + * not surrounded by lines that begin with --QUERY-DELIMITER-START and --QUERY-DELIMITER-END is + * still separated by semicolons. 2. Lines starting with -- are treated as comments and ignored. 3. + * Lines starting with --SET are used to specify the configs when running this testing file. You can + * set multiple configs in one --SET, using comma to separate them. Or you can use multiple --SET + * statements. 4. Lines starting with --IMPORT are used to load queries from another test file. 5. + * Lines starting with --CONFIG_DIM are used to specify config dimensions of this testing file. The + * dimension name is decided by the string after --CONFIG_DIM. For example, --CONFIG_DIM1 belongs to + * dimension 1. One dimension can have multiple lines, each line representing one config set (one or + * more configs, separated by comma). Spark will run this testing file many times, each time picks + * one config set from each dimension, until all the combinations are tried. 
For example, if + * dimension 1 has 2 lines, dimension 2 has 3 lines, this testing file will be run 6 times * (cartesian product). * * For example: diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala index e220924880c7a..5196e760fcd05 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenMathExpressionsSuite.scala @@ -248,6 +248,11 @@ class GlutenMathExpressionsSuite extends MathExpressionsSuite with GlutenTestsTr checkEvaluation(BRound(-3.5, 0), -4.0) checkEvaluation(BRound(-0.35, 1), -0.4) checkEvaluation(BRound(-35, -1), -40) + checkEvaluation(Round(1.12345678901234567, 8), 1.12345679) + checkEvaluation(Round(-0.98765432109876543, 5), -0.98765) + checkEvaluation(Round(12345.67890123456789, 6), 12345.678901) + // Enable the test after fixing https://github.com/apache/incubator-gluten/issues/6827 + // checkEvaluation(Round(0.5549999999999999, 2), 0.55) checkEvaluation(BRound(BigDecimal("45.00"), -1), BigDecimal(40)) checkEvaluation(checkDataTypeAndCast(RoundFloor(Literal(2.5), Literal(0))), Decimal(2)) checkEvaluation(checkDataTypeAndCast(RoundFloor(Literal(3.5), Literal(0))), Decimal(3)) diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala index 5150a47688519..bbdeebfe6a134 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/FallbackStrategiesSuite.scala @@ -16,10 +16,12 @@ */ package org.apache.spark.sql.execution -import org.apache.gluten.backendsapi.BackendsApiManager +import org.apache.gluten.GlutenConfig import org.apache.gluten.execution.BasicScanExecTransformer import org.apache.gluten.extension.GlutenPlan -import org.apache.gluten.extension.columnar.{FallbackEmptySchemaRelation, FallbackTags} +import org.apache.gluten.extension.columnar.{ExpandFallbackPolicy, FallbackTags, RemoveFallbackTagRule} +import org.apache.gluten.extension.columnar.ColumnarRuleApplier.ColumnarRuleBuilder +import org.apache.gluten.extension.columnar.MiscColumnarRules.RemoveTopmostColumnarToRow import org.apache.gluten.extension.columnar.heuristic.HeuristicApplier import org.apache.gluten.extension.columnar.transition.InsertTransitions import org.apache.gluten.utils.QueryPlanSelector @@ -28,20 +30,23 @@ import org.apache.spark.rdd.RDD import org.apache.spark.sql.{GlutenSQLTestsTrait, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute +import org.apache.spark.sql.catalyst.rules.Rule class FallbackStrategiesSuite extends GlutenSQLTestsTrait { + import FallbackStrategiesSuite._ testGluten("Fall back the whole query if one unsupported") { withSQLConf(("spark.gluten.sql.columnar.query.fallback.threshold", "1")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark).withTransformRules( + val rule = newRuleApplier( + spark, List( _ => _ => { UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val 
outputPlan = rule.apply(originalPlan) + c => InsertTransitions(c.outputsColumnar))) + val outputPlan = rule.apply(originalPlan, false) // Expect to fall back the entire plan. assert(outputPlan == originalPlan) } @@ -50,16 +55,16 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { testGluten("Fall back the whole plan if meeting the configured threshold") { withSQLConf(("spark.gluten.sql.columnar.wholeStage.fallback.threshold", "1")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark) + val rule = newRuleApplier( + spark, + List( + _ => + _ => { + UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) + }, + c => InsertTransitions(c.outputsColumnar))) .enableAdaptiveContext() - .withTransformRules( - List( - _ => - _ => { - UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) - }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + val outputPlan = rule.apply(originalPlan, false) // Expect to fall back the entire plan. assert(outputPlan == originalPlan) } @@ -68,16 +73,16 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { testGluten("Don't fall back the whole plan if NOT meeting the configured threshold") { withSQLConf(("spark.gluten.sql.columnar.wholeStage.fallback.threshold", "4")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark) + val rule = newRuleApplier( + spark, + List( + _ => + _ => { + UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) + }, + c => InsertTransitions(c.outputsColumnar))) .enableAdaptiveContext() - .withTransformRules( - List( - _ => - _ => { - UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOp())))) - }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + val outputPlan = rule.apply(originalPlan, false) // Expect to get the plan with columnar rule applied. assert(outputPlan != originalPlan) } @@ -88,16 +93,16 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { " transformable)") { withSQLConf(("spark.gluten.sql.columnar.wholeStage.fallback.threshold", "2")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark) + val rule = newRuleApplier( + spark, + List( + _ => + _ => { + UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOpTransformer())))) + }, + c => InsertTransitions(c.outputsColumnar))) .enableAdaptiveContext() - .withTransformRules( - List( - _ => - _ => { - UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOpTransformer())))) - }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + val outputPlan = rule.apply(originalPlan, false) // Expect to fall back the entire plan. 
assert(outputPlan == originalPlan) } @@ -108,16 +113,16 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { "leaf node is transformable)") { withSQLConf(("spark.gluten.sql.columnar.wholeStage.fallback.threshold", "3")) { val originalPlan = UnaryOp2(UnaryOp1(UnaryOp2(UnaryOp1(LeafOp())))) - val rule = new HeuristicApplier(spark) + val rule = newRuleApplier( + spark, + List( + _ => + _ => { + UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOpTransformer())))) + }, + c => InsertTransitions(c.outputsColumnar))) .enableAdaptiveContext() - .withTransformRules( - List( - _ => - _ => { - UnaryOp2(UnaryOp1Transformer(UnaryOp2(UnaryOp1Transformer(LeafOpTransformer())))) - }, - (_: SparkSession) => InsertTransitions(outputsColumnar = false))) - val outputPlan = rule.apply(originalPlan) + val outputPlan = rule.apply(originalPlan, false) // Expect to get the plan with columnar rule applied. assert(outputPlan != originalPlan) } @@ -129,13 +134,9 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { val rule = FallbackEmptySchemaRelation() val newPlan = rule.apply(originalPlan) val reason = FallbackTags.get(newPlan).reason() - if (BackendsApiManager.getSettings.fallbackOnEmptySchema(newPlan)) { - assert( - reason.contains("fake reason") && - reason.contains("at least one of its children has empty output")) - } else { - assert(reason.contains("fake reason")) - } + assert( + reason.contains("fake reason") && + reason.contains("at least one of its children has empty output")) } testGluten("test enabling/disabling Gluten at thread level") { @@ -170,43 +171,79 @@ class FallbackStrategiesSuite extends GlutenSQLTestsTrait { } } -case class LeafOp(override val supportsColumnar: Boolean = false) extends LeafExecNode { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = Seq.empty -} +private object FallbackStrategiesSuite { + def newRuleApplier( + spark: SparkSession, + transformBuilders: Seq[ColumnarRuleBuilder]): HeuristicApplier = { + new HeuristicApplier( + spark, + transformBuilders, + List(c => ExpandFallbackPolicy(c.ac.isAdaptiveContext(), c.ac.originalPlan())), + List( + c => RemoveTopmostColumnarToRow(c.session, c.ac.isAdaptiveContext()), + _ => ColumnarCollapseTransformStages(GlutenConfig.getConf) + ), + List(_ => RemoveFallbackTagRule()) + ) + } -case class UnaryOp1(child: SparkPlan, override val supportsColumnar: Boolean = false) - extends UnaryExecNode { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = child.output - override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp1 = - copy(child = newChild) -} + // TODO: Generalize the code among shim versions. + case class FallbackEmptySchemaRelation() extends Rule[SparkPlan] { + override def apply(plan: SparkPlan): SparkPlan = plan.transformDown { + case p => + if (p.children.exists(_.output.isEmpty)) { + // Some backends are not eligible to offload plan with zero-column input. + // If any child have empty output, mark the plan and that child as UNSUPPORTED. 
+ FallbackTags.add(p, "at least one of its children has empty output") + p.children.foreach { + child => + if (child.output.isEmpty) { + FallbackTags.add(child, "at least one of its children has empty output") + } + } + } + p + } + } -case class UnaryOp2(child: SparkPlan, override val supportsColumnar: Boolean = false) - extends UnaryExecNode { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = child.output - override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp2 = - copy(child = newChild) -} + case class LeafOp(override val supportsColumnar: Boolean = false) extends LeafExecNode { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = Seq.empty + } + + case class UnaryOp1(child: SparkPlan, override val supportsColumnar: Boolean = false) + extends UnaryExecNode { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = child.output + override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp1 = + copy(child = newChild) + } + + case class UnaryOp2(child: SparkPlan, override val supportsColumnar: Boolean = false) + extends UnaryExecNode { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = child.output + override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp2 = + copy(child = newChild) + } // For replacing LeafOp. -case class LeafOpTransformer(override val supportsColumnar: Boolean = true) - extends LeafExecNode - with GlutenPlan { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = Seq.empty -} + case class LeafOpTransformer(override val supportsColumnar: Boolean = true) + extends LeafExecNode + with GlutenPlan { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = Seq.empty + } // For replacing UnaryOp1. 
-case class UnaryOp1Transformer( - override val child: SparkPlan, - override val supportsColumnar: Boolean = true) - extends UnaryExecNode - with GlutenPlan { - override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() - override def output: Seq[Attribute] = child.output - override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp1Transformer = - copy(child = newChild) + case class UnaryOp1Transformer( + override val child: SparkPlan, + override val supportsColumnar: Boolean = true) + extends UnaryExecNode + with GlutenPlan { + override protected def doExecute(): RDD[InternalRow] = throw new UnsupportedOperationException() + override def output: Seq[Attribute] = child.output + override protected def withNewChildInternal(newChild: SparkPlan): UnaryOp1Transformer = + copy(child = newChild) + } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala index fcaf75a4d5c18..5fc887d8d410c 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/execution/datasources/GlutenV1WriteCommandSuite.scala @@ -152,7 +152,8 @@ class GlutenV1WriteCommandSuite ), false, _, - _) => + _ + ) => true case SortExecTransformer( Seq( @@ -169,7 +170,8 @@ class GlutenV1WriteCommandSuite ), false, _, - _) => + _ + ) => true case _ => false }, @@ -233,7 +235,8 @@ class GlutenV1WriteCommandSuite ), false, _, - _) => + _ + ) => true case SortExecTransformer( Seq( @@ -250,7 +253,8 @@ class GlutenV1WriteCommandSuite ), false, _, - _) => + _ + ) => true case _ => false }, diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala index 6816534094f32..2ca7429f16792 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/extension/GlutenSessionExtensionSuite.scala @@ -31,7 +31,8 @@ class GlutenSessionExtensionSuite extends GlutenSQLTestsTrait { } testGluten("test gluten extensions") { - assert(spark.sessionState.columnarRules.contains(ColumnarOverrideRules(spark))) + assert( + spark.sessionState.columnarRules.map(_.getClass).contains(classOf[ColumnarOverrideRules])) assert(spark.sessionState.planner.strategies.contains(MySparkStrategy(spark))) assert(spark.sessionState.analyzer.extendedResolutionRules.contains(MyRule(spark))) diff --git a/gluten-ut/test/pom.xml b/gluten-ut/test/pom.xml index 52761e988e1e5..fb637d5489374 100644 --- a/gluten-ut/test/pom.xml +++ b/gluten-ut/test/pom.xml @@ -29,13 +29,6 @@ test tests - - org.apache.gluten - gluten-core - ${project.version} - test-jar - test - @@ -57,24 +50,11 @@ ${celeborn.version} test - - - - backends-velox - - false - - - - org.apache.gluten - backends-velox - ${project.version} - test - org.apache.arrow - arrow-vector + arrow-memory-core ${arrow.version} + provided io.netty @@ -85,19 +65,12 @@ netty-buffer - test - - - org.apache.arrow - arrow-memory-netty - ${arrow.version} - test org.apache.arrow - arrow-memory-core + arrow-vector ${arrow.version} - test + provided io.netty @@ -110,6 +83,20 @@ + + + backends-velox + + false + + + + org.apache.gluten + 
backends-velox + ${project.version} + test + + diff --git a/gluten-ut/test/src/test/scala/org/apache/spark/sql/GlutenExpressionDataTypesValidation.scala b/gluten-ut/test/src/test/scala/org/apache/spark/sql/GlutenExpressionDataTypesValidation.scala index c8b2aaba280f5..d2a9611471ab1 100644 --- a/gluten-ut/test/src/test/scala/org/apache/spark/sql/GlutenExpressionDataTypesValidation.scala +++ b/gluten-ut/test/src/test/scala/org/apache/spark/sql/GlutenExpressionDataTypesValidation.scala @@ -24,11 +24,13 @@ import org.apache.gluten.utils.{BackendTestUtils, SystemParameters} import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow -import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.expressions.{BinaryArrayExpressionWithImplicitCast, _} import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.metric.SQLMetric import org.apache.spark.sql.types._ +import scala.collection.mutable.Buffer + class GlutenExpressionDataTypesValidation extends WholeStageTransformerSuite { protected val resourcePath: String = null protected val fileFormat: String = null @@ -61,6 +63,7 @@ class GlutenExpressionDataTypesValidation extends WholeStageTransformerSuite { private val allPrimitiveDataTypes: Seq[DataType] = Seq( + BooleanType, ByteType, ShortType, IntegerType, @@ -71,7 +74,8 @@ class GlutenExpressionDataTypesValidation extends WholeStageTransformerSuite { StringType, BinaryType, DateType, - TimestampType) + TimestampType, + NullType) private val allComplexDataTypes: Seq[DataType] = Seq( // Currently, only check certain inner types, assuming they are representative @@ -83,6 +87,7 @@ class GlutenExpressionDataTypesValidation extends WholeStageTransformerSuite { def generateChildExpression(t: DataType): Expression = { t match { + case _: BooleanType => Literal(true, t) case _: IntegralType => Literal(null, t) case _: FractionalType => Literal(null, t) case StringType | BinaryType => Literal("123") @@ -91,6 +96,7 @@ class GlutenExpressionDataTypesValidation extends WholeStageTransformerSuite { case ArrayType(_, _) => Literal(null, t) case MapType(_, _, _) => Literal(null, t) case StructType(_) => Literal(null, t) + case NullType => Literal(null, t) case _ => throw new UnsupportedOperationException("Not supported type: " + t) } } @@ -99,6 +105,19 @@ class GlutenExpressionDataTypesValidation extends WholeStageTransformerSuite { ProjectExecTransformer(namedExpr, DummyPlan()) } + def validateExpr(targetExpr: Expression): Unit = { + val glutenProject = generateGlutenProjectPlan(targetExpr) + if (targetExpr.resolved && glutenProject.doValidate().ok()) { + logInfo( + "## validation passes: " + targetExpr.getClass.getSimpleName + "(" + + targetExpr.children.map(_.dataType.toString).mkString(", ") + ")") + } else { + logInfo( + "!! 
validation fails: " + targetExpr.getClass.getSimpleName + "(" + + targetExpr.children.map(_.dataType.toString).mkString(", ") + ")") + } + } + test("cast") { for (from <- allPrimitiveDataTypes ++ allComplexDataTypes) { for (to <- allPrimitiveDataTypes ++ allComplexDataTypes) { @@ -120,21 +139,34 @@ class GlutenExpressionDataTypesValidation extends WholeStageTransformerSuite { test("unary expressions with expected input types") { val functionRegistry = spark.sessionState.functionRegistry val sparkBuiltInFunctions = functionRegistry.listFunction() + val exceptionalList: Buffer[Expression] = Buffer() + for (func <- sparkBuiltInFunctions) { val builder = functionRegistry.lookupFunctionBuilder(func).get - var expr: Expression = null - try { - // Instantiate an expression with null input. Just for obtaining the instance for checking - // its allowed input types. - expr = builder(Seq(null)) - } catch { - // Ignore the exception as some expression builders require more than one input. - case _: Throwable => + val expr: Expression = { + try { + // Instantiate an expression with null input. Just for obtaining the instance for checking + // its allowed input types. + builder(Seq(null)) + } catch { + // Ignore the exception as some expression builders require more than one input. + case _: Throwable => null + } + } + val needsValidation = if (expr == null) { + false + } else { + expr match { + // Validated separately. + case _: Cast => false + case _: ExpectsInputTypes if expr.isInstanceOf[UnaryExpression] => true + case _ => + exceptionalList += expr + false + } } - if ( - expr != null && expr.isInstanceOf[ExpectsInputTypes] && expr.isInstanceOf[UnaryExpression] - ) { - val acceptedTypes = allPrimitiveDataTypes.filter( + if (needsValidation) { + val acceptedTypes = allPrimitiveDataTypes ++ allComplexDataTypes.filter( expr.asInstanceOf[ExpectsInputTypes].inputTypes.head.acceptsType(_)) if (acceptedTypes.isEmpty) { logWarning("Any given type is not accepted for " + expr.getClass.getSimpleName) @@ -144,15 +176,97 @@ class GlutenExpressionDataTypesValidation extends WholeStageTransformerSuite { val child = generateChildExpression(t) // Builds an expression whose child's type is really accepted in Spark. val targetExpr = builder(Seq(child)) - val glutenProject = generateGlutenProjectPlan(targetExpr) - if (targetExpr.resolved && glutenProject.doValidate().ok()) { - logInfo("## validation passes: " + targetExpr.getClass.getSimpleName + "(" + t + ")") - } else { - logInfo("!! validation fails: " + targetExpr.getClass.getSimpleName + "(" + t + ")") - } + validateExpr(targetExpr) }) } } + + logWarning("Exceptional list:\n" + exceptionalList.mkString(", ")) } + def hasImplicitCast(expr: Expression): Boolean = expr match { + case _: ImplicitCastInputTypes => true + case _: BinaryOperator => true + case _ => false + } + + test("binary expressions with expected input types") { + val functionRegistry = spark.sessionState.functionRegistry + val exceptionalList: Buffer[Expression] = Buffer() + + val sparkBuiltInFunctions = functionRegistry.listFunction() + sparkBuiltInFunctions.foreach( + func => { + val builder = functionRegistry.lookupFunctionBuilder(func).get + val expr: Expression = { + try { + // Instantiate an expression with null input. Just for obtaining the instance for + // checking its allowed input types. + builder(Seq(null, null)) + } catch { + // Ignore the exception as some expression builders that don't require exact two input. 
+ case _: Throwable => null + } + } + val needsValidation = if (expr == null) { + false + } else { + expr match { + // Requires left/right child's DataType to determine inputTypes. + case _: BinaryArrayExpressionWithImplicitCast => + exceptionalList += expr + false + case _: ExpectsInputTypes if expr.isInstanceOf[BinaryExpression] => true + case _ => + exceptionalList += expr + false + } + } + + if (needsValidation) { + var acceptedLeftTypes: Seq[DataType] = Seq.empty + var acceptedRightTypes: Seq[DataType] = Seq.empty + try { + acceptedLeftTypes = allPrimitiveDataTypes ++ allComplexDataTypes.filter( + expr.asInstanceOf[ExpectsInputTypes].inputTypes(0).acceptsType(_)) + acceptedRightTypes = allPrimitiveDataTypes ++ allComplexDataTypes.filter( + expr.asInstanceOf[ExpectsInputTypes].inputTypes(1).acceptsType(_)) + } catch { + case _: java.lang.NullPointerException => + } + + if (acceptedLeftTypes.isEmpty || acceptedRightTypes.isEmpty) { + logWarning("Any given type is not accepted for " + expr.getClass.getSimpleName) + } + val leftChildList = acceptedLeftTypes.map( + t => { + generateChildExpression(t) + }) + if (hasImplicitCast(expr)) { + leftChildList.foreach( + left => { + // Spark's implicit cast makes same input types. + val targetExpr = builder(Seq(left, left)) + validateExpr(targetExpr) + }) + } else { + val rightChildList = acceptedRightTypes.map( + t => { + generateChildExpression(t) + }) + leftChildList.foreach( + left => { + rightChildList.foreach( + right => { + // Builds an expression whose child's type is really accepted in Spark. + val targetExpr = builder(Seq(left, right)) + validateExpr(targetExpr) + }) + }) + } + } + }) + + logWarning("Exceptional list:\n" + exceptionalList.mkString(", ")) + } } diff --git a/package/pom.xml b/package/pom.xml index 794118f833ce9..f385a2a5a0586 100644 --- a/package/pom.xml +++ b/package/pom.xml @@ -93,6 +93,16 @@ + + delta + + + org.apache.gluten + gluten-delta + ${project.version} + + + @@ -176,7 +186,12 @@ META-INF/*.SF META-INF/*.DSA - META-INF/*.RSA + META-INF/*.RSA + META-INF/DEPENDENCIES + META-INF/LICENSE.txt + META-INF/NOTICE.txt + LICENSE.txt + NOTICE.txt @@ -320,6 +335,12 @@ org.apache.spark.sql.execution.datasources.WriterBucketSpec$ org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand$ + org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter + org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter$ + org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter$SpillableIterator + org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter$ChainedIterator + org.apache.spark.memory.MemoryConsumer + org.apache.spark.memory.TaskMemoryManager com.google.protobuf.* diff --git a/package/src/main/resources/META-INF/LICENSE b/package/src/main/resources/META-INF/LICENSE new file mode 100644 index 0000000000000..3680275b939a2 --- /dev/null +++ b/package/src/main/resources/META-INF/LICENSE @@ -0,0 +1,262 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. 
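
For readers of the GlutenExpressionDataTypesValidation changes above: the new tests probe Spark's built-in FunctionRegistry, instantiate each function builder with null children purely to read its declared input types, and then rebuild the expression with literals of every accepted type before asking Gluten whether it offloads. The sketch below shows that probing pattern in isolation and is not part of this patch; it assumes only a local SparkSession, the object name ExpressionProbe and the trimmed type list are illustrative, and it sits in package org.apache.spark.sql so the private[sql] members the real suite relies on (sessionState, acceptsType) stay visible.

package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
import org.apache.spark.sql.types._

object ExpressionProbe {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("expr-probe").getOrCreate()
    val registry = spark.sessionState.functionRegistry
    // A trimmed-down stand-in for allPrimitiveDataTypes in the suite above.
    val candidateTypes: Seq[DataType] =
      Seq(BooleanType, IntegerType, LongType, DoubleType, StringType, DateType, TimestampType)

    registry.listFunction().foreach { func =>
      val builder = registry.lookupFunctionBuilder(func).get
      // Instantiate with a single null child only to inspect the declared input types;
      // builders that require a different arity throw here and are simply skipped.
      val expr: Expression =
        try builder(Seq(null))
        catch { case _: Throwable => null }
      expr match {
        case e: UnaryExpression with ExpectsInputTypes if e.inputTypes.nonEmpty =>
          val accepted = candidateTypes.filter(t => e.inputTypes.head.acceptsType(t))
          println(s"${func.funcName}: accepts ${accepted.mkString(", ")}")
        case _ => // null, non-unary, or no declared input types: out of scope for this sketch
      }
    }
    spark.stop()
  }
}

In the real suite each accepted type is turned into a child via generateChildExpression, the rebuilt expression is wrapped in a ProjectExecTransformer over a DummyPlan, and doValidate().ok() decides whether validation passes or fails for that type combination.
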
+ + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+ +--------------------------------------------------------- + +This project bundles some components that are also licensed under the Apache +License Version 2.0: + +com.fasterxml.jackson.core:jackson-core +com.fasterxml.jackson.core:jackson-databind +com.fasterxml.jackson.datatype:jackson-datatype-jsr310 +com.fasterxml.jackson.module:jackson-module-scala_2.12 +com.google.code.findbugs:jsr305 +com.google.code.gson:gson +com.google.errorprone:error_prone_annotations +com.google.flatbuffers:flatbuffers-java +com.google.guava:failureaccess +com.google.guava:guava +com.google.guava:listenablefuture +com.google.j2objc:j2objc-annotations +com.google.jimfs:jimfs +com.github.ben-manes.caffeine:caffeine +commons-codec:commons-codec +info.picocli:picocli +io.trino.tpcds:tpcds +io.trino.tpch:tpch +javax.inject:javax.inject +org.scala-lang:scala-library +org.apache.arrow:arrow-format +org.apache.arrow:arrow-memory-core +org.apache.arrow:arrow-memory-unsafe +org.apache.arrow:arrow-vector + +--------------------------------------------------------- + +This product bundles various third-party components under other open source licenses. +This section summarizes those components and their licenses. See licenses-binary/ +for text of these licenses. + +BSD 3-Clause +------------ + +com.thoughtworks.paranamer:paranamer +io.glutenproject:protobuf-java +io.glutenproject:protobuf-java-util +org.eclipse.collections:eclipse-collections +org.eclipse.collections:eclipse-collections-api + + +MIT License +----------- + +org.checkerframework:checker-qual +org.slf4j:slf4j-api + + +Eclipse Public License (EPL) 1.0 +-------------------------------- + +org.eclipse.collections:eclipse-collections +org.eclipse.collections:eclipse-collections-api + diff --git a/package/src/main/resources/META-INF/NOTICE b/package/src/main/resources/META-INF/NOTICE new file mode 100644 index 0000000000000..be510d300f22d --- /dev/null +++ b/package/src/main/resources/META-INF/NOTICE @@ -0,0 +1,1722 @@ +Apache Gluten(incubating) +Copyright 2023-2024 The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +## Third-party Content + +This project leverages the following third party content. + +Apache Spark +Copyright 2014 and onwards The Apache Software Foundation. + +--------------------------------------------------------- + +Apache Celeborn +Copyright 2022-2024 The Apache Software Foundation. + +--------------------------------------------------------- + +Apache Uniffle (incubating) +Copyright 2022 and onwards The Apache Software Foundation. + +--------------------------------------------------------- + +Apache Iceberg +Copyright 2017-2024 The Apache Software Foundation. + +--------------------------------------------------------- + +Apache Parquet MR +Copyright 2014-2024 The Apache Software Foundation. + +--------------------------------------------------------- + +Apache ORC +Copyright 2013 and onwards The Apache Software Foundation. + +--------------------------------------------------------- + +Apache Thrift +Copyright (C) 2006 - 2019, The Apache Software Foundation. + +--------------------------------------------------------- + +This project includes code from Daniel Lemire's FrameOfReference project. 
+ +https://github.com/lemire/FrameOfReference/blob/6ccaf9e97160f9a3b299e23a8ef739e711ef0c71/src/bpacking.cpp + +Copyright: 2013 Daniel Lemire +Home page: http://lemire.me/en/ +Project page: https://github.com/lemire/FrameOfReference +License: Apache License Version 2.0 http://www.apache.org/licenses/LICENSE-2.0 + +--------------------------------------------------------- + +This project includes code from the TensorFlow project + +Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +--------------------------------------------------------- + +This project includes code from the NumPy project. + +https://github.com/numpy/numpy/blob/e1f191c46f2eebd6cb892a4bfe14d9dd43a06c4e/numpy/core/src/multiarray/multiarraymodule.c#L2910 + +https://github.com/numpy/numpy/blob/68fd82271b9ea5a9e50d4e761061dfcca851382a/numpy/core/src/multiarray/datetime.c + +Copyright (c) 2005-2017, NumPy Developers. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the NumPy Developers nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +--------------------------------------------------------- + +This project includes code from the Boost project + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +--------------------------------------------------------- + +This project includes code from the FlatBuffers project + +Copyright 2014 Google Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +--------------------------------------------------------- + +This project includes code from the tslib project + +Copyright 2015 Microsoft Corporation. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +--------------------------------------------------------- + +This project includes code from the jemalloc project + +https://github.com/jemalloc/jemalloc + +Copyright (C) 2002-2017 Jason Evans . +All rights reserved. +Copyright (C) 2007-2012 Mozilla Foundation. All rights reserved. +Copyright (C) 2009-2017 Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +1. Redistributions of source code must retain the above copyright notice(s), + this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright notice(s), + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) ``AS IS'' AND ANY EXPRESS +OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO +EVENT SHALL THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE +OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF +ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--------------------------------------------------------- + +This project includes code from the Go project, BSD 3-clause license + PATENTS +weak patent termination clause +(https://github.com/golang/go/blob/master/PATENTS). + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +--------------------------------------------------------- + +This project includes code from the hs2client + +https://github.com/cloudera/hs2client + +Copyright 2016 Cloudera Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +-------------------------------------------------------------------------------- + +The script ci/scripts/util_wait_for_it.sh has the following license + +Copyright (c) 2016 Giles Hall + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +The script r/configure has the following license (MIT) + +Copyright (c) 2017, Jeroen Ooms and Jim Hester + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +cpp/src/arrow/util/logging.cc, cpp/src/arrow/util/logging.h and +cpp/src/arrow/util/logging-test.cc are adapted from +Ray Project (https://github.com/ray-project/ray) (Apache 2.0). + +Copyright (c) 2016 Ray Project (https://github.com/ray-project/ray) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +-------------------------------------------------------------------------------- +The files cpp/src/arrow/vendored/datetime/date.h, cpp/src/arrow/vendored/datetime/tz.h, +cpp/src/arrow/vendored/datetime/tz_private.h, cpp/src/arrow/vendored/datetime/ios.h, +cpp/src/arrow/vendored/datetime/tz.cpp are adapted from +Howard Hinnant's date library (https://github.com/HowardHinnant/date) +It is licensed under MIT license. + +The MIT License (MIT) +Copyright (c) 2015, 2016, 2017 Howard Hinnant +Copyright (c) 2016 Adrian Colomitchi +Copyright (c) 2017 Florian Dang +Copyright (c) 2017 Paul Thompson +Copyright (c) 2018 Tomasz Kamiński + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +-------------------------------------------------------------------------------- + +The file cpp/src/arrow/util/utf8.h includes code adapted from the page + https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ +with the following license (MIT) + +Copyright (c) 2008-2009 Bjoern Hoehrmann + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
+ +-------------------------------------------------------------------------------- + +The file cpp/src/arrow/vendored/string_view.hpp has the following license + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +The file cpp/src/arrow/vendored/variant.hpp has the following license + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/xxhash/ have the following license +(BSD 2-Clause License) + +xxHash Library +Copyright (c) 2012-2014, Yann Collet +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +You can contact the author at : +- xxHash homepage: http://www.xxhash.com +- xxHash source repository : https://github.com/Cyan4973/xxHash + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/double-conversion/ have the following license +(BSD 3-Clause License) + +Copyright 2006-2011, the V8 project authors. All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/uriparser/ have the following license +(BSD 3-Clause License) + +uriparser - RFC 3986 URI parsing library + +Copyright (C) 2007, Weijia Song +Copyright (C) 2007, Sebastian Pipping +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above + copyright notice, this list of conditions and the following + disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + * Neither the name of the nor the names of its + contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +The files under dev/tasks/conda-recipes have the following license + +BSD 3-clause license +Copyright (c) 2015-2018, conda-forge +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software without + specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR +TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +-------------------------------------------------------------------------------- + +The files in cpp/src/arrow/vendored/utf8cpp/ have the following license + +Copyright 2006 Nemanja Trifunovic + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +This project includes code from Apache Kudu. + + * cpp/cmake_modules/CompilerInfo.cmake is based on Kudu's cmake_modules/CompilerInfo.cmake + +Copyright: 2016 The Apache Software Foundation. +Home page: https://kudu.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This project includes code from Apache Impala (incubating), formerly +Impala. The Impala code and rights were donated to the ASF as part of the +Incubator process after the initial code imports into Apache Parquet. + +Copyright: 2012 Cloudera, Inc. +Copyright: 2016 The Apache Software Foundation. +Home page: http://impala.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This project includes code from Apache Aurora. + +* dev/release/{release,changelog,release-candidate} are based on the scripts from + Apache Aurora + +Copyright: 2016 The Apache Software Foundation. +Home page: https://aurora.apache.org/ +License: http://www.apache.org/licenses/LICENSE-2.0 + +-------------------------------------------------------------------------------- + +This project includes code from the Google styleguide. + +* cpp/build-support/cpplint.py is based on the scripts from the Google styleguide. + +Copyright: 2009 Google Inc. All rights reserved. +Homepage: https://github.com/google/styleguide +License: 3-clause BSD + +-------------------------------------------------------------------------------- + +This project includes code from Snappy. + +* cpp/cmake_modules/{SnappyCMakeLists.txt,SnappyConfig.h} are based on code + from Google's Snappy project. + +Copyright: 2009 Google Inc. All rights reserved. 
+Homepage: https://github.com/google/snappy +License: 3-clause BSD + +-------------------------------------------------------------------------------- + +This project includes code from the manylinux project. + +* python/manylinux1/scripts/{build_python.sh,python-tag-abi-tag.py, + requirements.txt} are based on code from the manylinux project. + +Copyright: 2016 manylinux +Homepage: https://github.com/pypa/manylinux +License: The MIT License (MIT) + +-------------------------------------------------------------------------------- + +This project includes code from the cymove project: + +* python/pyarrow/includes/common.pxd includes code from the cymove project + +The MIT License (MIT) +Copyright (c) 2019 Omer Ozarslan + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR +OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE +OR OTHER DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +The projects includes code from the Ursabot project under the dev/archery +directory. + +License: BSD 2-Clause + +Copyright 2019 RStudio, Inc. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +This project include code from CMake. + +* cpp/cmake_modules/FindGTest.cmake is based on code from CMake. + +Copyright: Copyright 2000-2019 Kitware, Inc. 
and Contributors +Homepage: https://gitlab.kitware.com/cmake/cmake +License: 3-clause BSD + +-------------------------------------------------------------------------------- + +This project include code from mingw-w64. + +* cpp/src/arrow/util/cpu-info.cc has a polyfill for mingw-w64 < 5 + +Copyright (c) 2009 - 2013 by the mingw-w64 project +Homepage: https://mingw-w64.org +License: Zope Public License (ZPL) Version 2.1. + +--------------------------------------------------------------------------------- + +This project include code from Google's Asylo project. + +* cpp/src/arrow/result.h is based on status_or.h + +Copyright (c) Copyright 2017 Asylo authors +Homepage: https://asylo.dev/ +License: Apache 2.0 + +-------------------------------------------------------------------------------- + +This project includes code from Google's protobuf project + +* cpp/src/arrow/result.h ARROW_ASSIGN_OR_RAISE is based off ASSIGN_OR_RETURN + +Copyright 2008 Google Inc. All rights reserved. +Homepage: https://developers.google.com/protocol-buffers/ +License: + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Code generated by the Protocol Buffer compiler is owned by the owner +of the input file used when generating it. This code is not +standalone and requires a support library to be linked with it. This +support library is itself covered by the above license. + +-------------------------------------------------------------------------------- + +3rdparty dependency LLVM is statically linked in certain binary +distributions. LLVM has the following license: + +============================================================================== +LLVM Release License +============================================================================== +University of Illinois/NCSA +Open Source License + +Copyright (c) 2003-2018 University of Illinois at Urbana-Champaign. +All rights reserved. 
+ +Developed by: + + LLVM Team + + University of Illinois at Urbana-Champaign + + http://llvm.org + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal with +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimers. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimers in the + documentation and/or other materials provided with the distribution. + + * Neither the names of the LLVM Team, University of Illinois at + Urbana-Champaign, nor the names of its contributors may be used to + endorse or promote products derived from this Software without specific + prior written permission. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS +FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +CONTRIBUTORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE +SOFTWARE. + +============================================================================== +Copyrights and Licenses for Third Party Software Distributed with LLVM: +============================================================================== +The LLVM software contains code written by third parties. Such software will +have its own individual LICENSE.TXT file in the directory in which it appears. +This file will describe the copyrights, license, and restrictions which apply +to that code. + +The disclaimer of warranty in the University of Illinois Open Source License +applies to all code in the LLVM Distribution, and nothing in any of the +other licenses gives permission to use the names of the LLVM Team or the +University of Illinois to endorse or promote products derived from this +Software. + +The following pieces of software have additional or alternate copyrights, +licenses, and/or restrictions: + +Program Directory +------- --------- +Google Test llvm/utils/unittest/googletest +OpenBSD regex llvm/lib/Support/{reg*, COPYRIGHT.regex} +pyyaml tests llvm/test/YAMLParser/{*.data, LICENSE.TXT} +ARM contributions llvm/lib/Target/ARM/LICENSE.TXT +md5 contributions llvm/lib/Support/MD5.cpp llvm/include/llvm/Support/MD5.h + +-------------------------------------------------------------------------------- + +3rdparty dependency gRPC is statically linked in certain binary +distributions, like the python wheels. gRPC has the following license: + +Copyright 2014 gRPC authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +3rdparty dependency Apache Thrift is statically linked in certain binary +distributions, like the python wheels. Apache Thrift has the following license: + +Apache Thrift +Copyright (C) 2006 - 2019, The Apache Software Foundation + +This product includes software developed at +The Apache Software Foundation (http://www.apache.org/). + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +3rdparty dependency Apache ORC is statically linked in certain binary +distributions, like the python wheels. Apache ORC has the following license: + +Apache ORC +Copyright 2013-2019 The Apache Software Foundation + +This product includes software developed by The Apache Software +Foundation (http://www.apache.org/). + +This product includes software developed by Hewlett-Packard: +(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + +-------------------------------------------------------------------------------- + +3rdparty dependency zstd is statically linked in certain binary +distributions, like the python wheels. ZSTD has the following license: + +BSD License + +For Zstandard software + +Copyright (c) 2016-present, Facebook, Inc. All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + * Neither the name Facebook nor the names of its contributors may be used to + endorse or promote products derived from this software without specific + prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +3rdparty dependency lz4 is statically linked in certain binary +distributions, like the python wheels. lz4 has the following license: + +LZ4 Library +Copyright (c) 2011-2016, Yann Collet +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, this + list of conditions and the following disclaimer in the documentation and/or + other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +3rdparty dependency Brotli is statically linked in certain binary +distributions, like the python wheels. Brotli has the following license: + +Copyright (c) 2009, 2010, 2013-2016 by the Brotli Authors. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+ +-------------------------------------------------------------------------------- + +3rdparty dependency rapidjson is statically linked in certain binary +distributions, like the python wheels. rapidjson and its dependencies have the +following licenses: + +Tencent is pleased to support the open source community by making RapidJSON +available. + +Copyright (C) 2015 THL A29 Limited, a Tencent company, and Milo Yip. +All rights reserved. + +If you have downloaded a copy of the RapidJSON binary from Tencent, please note +that the RapidJSON binary is licensed under the MIT License. +If you have downloaded a copy of the RapidJSON source code from Tencent, please +note that RapidJSON source code is licensed under the MIT License, except for +the third-party components listed below which are subject to different license +terms. Your integration of RapidJSON into your own projects may require +compliance with the MIT License, as well as the other licenses applicable to +the third-party components included within RapidJSON. To avoid the problematic +JSON license in your own projects, it's sufficient to exclude the +bin/jsonchecker/ directory, as it's the only code under the JSON license. +A copy of the MIT License is included in this file. + +Other dependencies and licenses: + + Open Source Software Licensed Under the BSD License: + -------------------------------------------------------------------- + + The msinttypes r29 + Copyright (c) 2006-2013 Alexander Chemeris + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of copyright holder nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY + EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR + ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH + DAMAGE. + + Open Source Software Licensed Under the JSON License: + -------------------------------------------------------------------- + + json.org + Copyright (c) 2002 JSON.org + All Rights Reserved. + + JSON_checker + Copyright (c) 2002 JSON.org + All Rights Reserved. 
+ + + Terms of the JSON License: + --------------------------------------------------- + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + The Software shall be used for Good, not Evil. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + + + Terms of the MIT License: + -------------------------------------------------------------------- + + Permission is hereby granted, free of charge, to any person obtaining a + copy of this software and associated documentation files (the "Software"), + to deal in the Software without restriction, including without limitation + the rights to use, copy, modify, merge, publish, distribute, sublicense, + and/or sell copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included + in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + DEALINGS IN THE SOFTWARE. + +-------------------------------------------------------------------------------- + +3rdparty dependency snappy is statically linked in certain binary +distributions, like the python wheels. snappy has the following license: + +Copyright 2011, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + * Neither the name of Google Inc. nor the names of its contributors may be + used to endorse or promote products derived from this software without + specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +=== + +Some of the benchmark data in testdata/ is licensed differently: + + - fireworks.jpeg is Copyright 2013 Steinar H. Gunderson, and + is licensed under the Creative Commons Attribution 3.0 license + (CC-BY-3.0). See https://creativecommons.org/licenses/by/3.0/ + for more information. + + - kppkn.gtb is taken from the Gaviota chess tablebase set, and + is licensed under the MIT License. See + https://sites.google.com/site/gaviotachessengine/Home/endgame-tablebases-1 + for more information. + + - paper-100k.pdf is an excerpt (bytes 92160 to 194560) from the paper + “Combinatorial Modeling of Chromatin Features Quantitatively Predicts DNA + Replication Timing in _Drosophila_” by Federico Comoglio and Renato Paro, + which is licensed under the CC-BY license. See + http://www.ploscompbiol.org/static/license for more ifnormation. + + - alice29.txt, asyoulik.txt, plrabn12.txt and lcet10.txt are from Project + Gutenberg. The first three have expired copyrights and are in the public + domain; the latter does not have expired copyright, but is still in the + public domain according to the license information + (http://www.gutenberg.org/ebooks/53). + +-------------------------------------------------------------------------------- + +3rdparty dependency gflags is statically linked in certain binary +distributions, like the python wheels. gflags has the following license: + +Copyright (c) 2006, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +3rdparty dependency glog is statically linked in certain binary +distributions, like the python wheels. glog has the following license: + +Copyright (c) 2008, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +A function gettimeofday in utilities.cc is based on + +http://www.google.com/codesearch/p?hl=en#dR3YEbitojA/COPYING&q=GetSystemTimeAsFileTime%20license:bsd + +The license of this code is: + +Copyright (c) 2003-2008, Jouni Malinen and contributors +All Rights Reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + +3. Neither the name(s) of the above-listed copyright holder(s) nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +3rdparty dependency re2 is statically linked in certain binary +distributions, like the python wheels. re2 has the following license: + +Copyright (c) 2009 The RE2 Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + * Neither the name of Google Inc. nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +3rdparty dependency c-ares is statically linked in certain binary +distributions, like the python wheels. c-ares has the following license: + +# c-ares license + +Copyright (c) 2007 - 2018, Daniel Stenberg with many contributors, see AUTHORS +file. + +Copyright 1998 by the Massachusetts Institute of Technology. + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose and without fee is hereby granted, provided that +the above copyright notice appear in all copies and that both that copyright +notice and this permission notice appear in supporting documentation, and that +the name of M.I.T. not be used in advertising or publicity pertaining to +distribution of the software without specific, written prior permission. +M.I.T. makes no representations about the suitability of this software for any +purpose. It is provided "as is" without express or implied warranty. + +-------------------------------------------------------------------------------- + +3rdparty dependency zlib is redistributed as a dynamically linked shared +library in certain binary distributions, like the python wheels. In the future +this will likely change to static linkage. 
zlib has the following license: + +zlib.h -- interface of the 'zlib' general purpose compression library + version 1.2.11, January 15th, 2017 + + Copyright (C) 1995-2017 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + Jean-loup Gailly Mark Adler + jloup@gzip.org madler@alumni.caltech.edu + +-------------------------------------------------------------------------------- + +3rdparty dependency openssl is redistributed as a dynamically linked shared +library in certain binary distributions, like the python wheels. openssl +preceding version 3 has the following license: + + LICENSE ISSUES + ============== + + The OpenSSL toolkit stays under a double license, i.e. both the conditions of + the OpenSSL License and the original SSLeay license apply to the toolkit. + See below for the actual license texts. + + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2019 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the rouines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + +-------------------------------------------------------------------------------- + +This project includes code from the rtools-backports project. + +* ci/scripts/PKGBUILD and ci/scripts/r_windows_build.sh are based on code + from the rtools-backports project. + +Copyright: Copyright (c) 2013 - 2019, Алексей and Jeroen Ooms. +All rights reserved. +Homepage: https://github.com/r-windows/rtools-backports +License: 3-clause BSD + +-------------------------------------------------------------------------------- + +Some code from pandas has been adapted for the pyarrow codebase. pandas is +available under the 3-clause BSD license, which follows: + +pandas license +============== + +Copyright (c) 2011-2012, Lambda Foundry, Inc. and PyData Development Team +All rights reserved. + +Copyright (c) 2008-2011 AQR Capital Management, LLC +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of the copyright holder nor the names of any + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +-------------------------------------------------------------------------------- + +Some bits from DyND, in particular aspects of the build system, have been +adapted from libdynd and dynd-python under the terms of the BSD 2-clause +license + +The BSD 2-Clause License + + Copyright (C) 2011-12, Dynamic NDArray Developers + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Dynamic NDArray Developers list: + + * Mark Wiebe + * Continuum Analytics + +-------------------------------------------------------------------------------- + +Some source code from Ibis (https://github.com/cloudera/ibis) has been adapted +for PyArrow. Ibis is released under the Apache License, Version 2.0. + +-------------------------------------------------------------------------------- + +This project includes code from the autobrew project. + +* r/tools/autobrew and dev/tasks/homebrew-formulae/autobrew/apache-arrow.rb + are based on code from the autobrew project. + +Copyright (c) 2019, Jeroen Ooms +License: MIT +Homepage: https://github.com/jeroen/autobrew + +-------------------------------------------------------------------------------- + +dev/tasks/homebrew-formulae/apache-arrow.rb has the following license: + +BSD 2-Clause License + +Copyright (c) 2009-present, Homebrew contributors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +---------------------------------------------------------------------- + +cpp/src/arrow/vendored/base64.cpp has the following license + +ZLIB License + +Copyright (C) 2004-2017 René Nyffenegger + +This source code is provided 'as-is', without any express or implied +warranty. In no event will the author be held liable for any damages arising +from the use of this software. + +Permission is granted to anyone to use this software for any purpose, including +commercial applications, and to alter it and redistribute it freely, subject to +the following restrictions: + +1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + +3. This notice may not be removed or altered from any source distribution. + +René Nyffenegger rene.nyffenegger@adp-gmbh.ch + +-------------------------------------------------------------------------------- + +The file cpp/src/arrow/vendored/optional.hpp has the following license + +Boost Software License - Version 1.0 - August 17th, 2003 + +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
diff --git a/pom.xml b/pom.xml index fd44d06b759c7..4fb327f9f7ddd 100644 --- a/pom.xml +++ b/pom.xml @@ -97,7 +97,7 @@ 32.0.1-jre 2.27.2 - 3.5.9 + 3.8.3 package /* @@ -147,6 +147,7 @@ 2.13.8 2.13 + 3.8.3 diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala index ed7a811929940..fa78060dad6c8 100644 --- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala +++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala @@ -116,6 +116,9 @@ class GlutenConfig(conf: SQLConf) extends Logging { def forceParquetTimestampTypeScanFallbackEnabled: Boolean = conf.getConf(VELOX_FORCE_PARQUET_TIMESTAMP_TYPE_SCAN_FALLBACK) + def scanFileSchemeValidationEnabled: Boolean = + conf.getConf(VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED) + // whether to use ColumnarShuffleManager def isUseColumnarShuffleManager: Boolean = conf @@ -187,7 +190,6 @@ class GlutenConfig(conf: SQLConf) extends Logging { def columnarShuffleCompressionThreshold: Int = conf.getConf(COLUMNAR_SHUFFLE_COMPRESSION_THRESHOLD) - // FIXME: Not clear: MIN or MAX ? def maxBatchSize: Int = conf.getConf(COLUMNAR_MAX_BATCH_SIZE) def columnarToRowMemThreshold: Long = @@ -329,12 +331,11 @@ class GlutenConfig(conf: SQLConf) extends Logging { def veloxResizeBatchesShuffleInputRange: ResizeRange = { val standardSize = conf.getConf(COLUMNAR_MAX_BATCH_SIZE) - val defaultRange: ResizeRange = - ResizeRange((0.25 * standardSize).toInt.max(1), 4 * standardSize) - conf - .getConf(COLUMNAR_VELOX_RESIZE_BATCHES_SHUFFLE_INPUT_RANGE) - .map(ResizeRange.parse) - .getOrElse(defaultRange) + val defaultMinSize: Int = (0.25 * standardSize).toInt.max(1) + val minSize = conf + .getConf(COLUMNAR_VELOX_RESIZE_BATCHES_SHUFFLE_INPUT_MIN_SIZE) + .getOrElse(defaultMinSize) + ResizeRange(minSize, Int.MaxValue) } def chColumnarShuffleSpillThreshold: Long = { @@ -537,16 +538,17 @@ object GlutenConfig { val GLUTEN_ONHEAP_SIZE_KEY = "spark.executor.memory" val GLUTEN_OFFHEAP_SIZE_KEY = "spark.memory.offHeap.size" val GLUTEN_OFFHEAP_ENABLED = "spark.memory.offHeap.enabled" + val SPARK_REDACTION_REGEX = "spark.redaction.regex" // For Soft Affinity Scheduling - // Enable Soft Affinity Scheduling, defalut value is false + // Enable Soft Affinity Scheduling, default value is false val GLUTEN_SOFT_AFFINITY_ENABLED = "spark.gluten.soft-affinity.enabled" val GLUTEN_SOFT_AFFINITY_ENABLED_DEFAULT_VALUE = false - // Calculate the number of the replcations for scheduling to the target executors per file + // Calculate the number of the replications for scheduling to the target executors per file val GLUTEN_SOFT_AFFINITY_REPLICATIONS_NUM = "spark.gluten.soft-affinity.replications.num" val GLUTEN_SOFT_AFFINITY_REPLICATIONS_NUM_DEFAULT_VALUE = 2 // For on HDFS, if there are already target hosts, - // and then prefer to use the orginal target hosts to schedule + // and then prefer to use the original target hosts to schedule val GLUTEN_SOFT_AFFINITY_MIN_TARGET_HOSTS = "spark.gluten.soft-affinity.min.target-hosts" val GLUTEN_SOFT_AFFINITY_MIN_TARGET_HOSTS_DEFAULT_VALUE = 1 @@ -586,9 +588,6 @@ object GlutenConfig { val GLUTEN_SHUFFLE_WRITER_MERGE_THRESHOLD = "spark.gluten.sql.columnar.shuffle.merge.threshold" - // Columnar to row memory threshold. - val GLUTEN_COLUMNAR_TO_ROW_MEM_THRESHOLD_KEY = "spark.gluten.sql.columnarToRowMemoryThreshold" - // Controls whether to load DLL from jars. User can get dependent native libs packed into a jar // by executing dev/package.sh. 
Then, with that jar configured, Gluten can load the native libs // at runtime. This config is just for velox backend. And it is NOT applicable to the situation @@ -653,7 +652,6 @@ object GlutenConfig { GLUTEN_SAVE_DIR, GLUTEN_TASK_OFFHEAP_SIZE_IN_BYTES_KEY, GLUTEN_MAX_BATCH_SIZE_KEY, - GLUTEN_COLUMNAR_TO_ROW_MEM_THRESHOLD_KEY, GLUTEN_SHUFFLE_WRITER_BUFFER_SIZE, SQLConf.SESSION_LOCAL_TIMEZONE.key, GLUTEN_DEFAULT_SESSION_TIMEZONE_KEY, @@ -680,7 +678,8 @@ object GlutenConfig { // gcs config SPARK_GCS_STORAGE_ROOT_URL, SPARK_GCS_AUTH_TYPE, - SPARK_GCS_AUTH_SERVICE_ACCOUNT_JSON_KEYFILE + SPARK_GCS_AUTH_SERVICE_ACCOUNT_JSON_KEYFILE, + SPARK_REDACTION_REGEX ) nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava) @@ -689,7 +688,10 @@ object GlutenConfig { (SQLConf.IGNORE_MISSING_FILES.key, SQLConf.IGNORE_MISSING_FILES.defaultValueString), ( COLUMNAR_MEMORY_BACKTRACE_ALLOCATION.key, - COLUMNAR_MEMORY_BACKTRACE_ALLOCATION.defaultValueString) + COLUMNAR_MEMORY_BACKTRACE_ALLOCATION.defaultValueString), + ( + GLUTEN_COLUMNAR_TO_ROW_MEM_THRESHOLD.key, + GLUTEN_COLUMNAR_TO_ROW_MEM_THRESHOLD.defaultValue.get.toString) ) keyWithDefault.forEach(e => nativeConfMap.put(e._1, conf.getOrElse(e._1, e._2))) @@ -764,7 +766,8 @@ object GlutenConfig { GLUTEN_TASK_OFFHEAP_SIZE_IN_BYTES_KEY, GLUTEN_OFFHEAP_ENABLED, SESSION_LOCAL_TIMEZONE.key, - DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key + DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key, + SPARK_REDACTION_REGEX ) nativeConfMap.putAll(conf.filter(e => keys.contains(e._1)).asJava) @@ -1122,7 +1125,7 @@ object GlutenConfig { .createWithDefault(4096) val GLUTEN_COLUMNAR_TO_ROW_MEM_THRESHOLD = - buildConf(GLUTEN_COLUMNAR_TO_ROW_MEM_THRESHOLD_KEY) + buildConf("spark.gluten.sql.columnarToRowMemoryThreshold") .internal() .bytesConf(ByteUnit.BYTE) .createWithDefaultString("64MB") @@ -1319,10 +1322,11 @@ object GlutenConfig { val RAS_COST_MODEL = buildConf("spark.gluten.ras.costModel") .doc( - "Experimental: The class name of user-defined cost model that will be used by RAS. " + - "If not specified, a rough built-in cost model will be used.") + "Experimental: The class name of user-defined cost model that will be used by RAS. If " + + "not specified, a legacy built-in cost model that exhaustively offloads computations " + + "will be used.") .stringConf - .createWithDefaultString("rough") + .createWithDefaultString("legacy") // velox caching options. val COLUMNAR_VELOX_CACHE_ENABLED = @@ -1492,17 +1496,16 @@ object GlutenConfig { .booleanConf .createWithDefault(true) - val COLUMNAR_VELOX_RESIZE_BATCHES_SHUFFLE_INPUT_RANGE = - buildConf("spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput.range") + val COLUMNAR_VELOX_RESIZE_BATCHES_SHUFFLE_INPUT_MIN_SIZE = + buildConf("spark.gluten.sql.columnar.backend.velox.resizeBatches.shuffleInput.minSize") .internal() .doc( - s"The minimum and maximum batch sizes for shuffle. If the batch size is " + - s"smaller / bigger than minimum / maximum value, it will be combined with other " + - s"batches / split before sending to shuffle. Only functions when " + + s"The minimum batch size for shuffle. If size of an input batch is " + + s"smaller than the value, it will be combined with other " + + s"batches before sending to shuffle. Only functions when " + s"${COLUMNAR_VELOX_RESIZE_BATCHES_SHUFFLE_INPUT.key} is set to true. " + - s"A valid value for the option is min~max. 
" + - s"E.g., s.g.s.c.b.v.resizeBatches.shuffleInput.range=100~10000") - .stringConf + s"Default value: 0.25 * ") + .intConf .createOptional val COLUMNAR_CH_SHUFFLE_SPILL_THRESHOLD = @@ -2011,6 +2014,16 @@ object GlutenConfig { .booleanConf .createWithDefault(false) + val VELOX_SCAN_FILE_SCHEME_VALIDATION_ENABLED = + buildConf("spark.gluten.sql.scan.fileSchemeValidation.enabled") + .internal() + .doc( + "When true, enable file path scheme validation for scan. Validation will fail if" + + " file scheme is not supported by registered file systems, which will cause scan " + + " operator fall back.") + .booleanConf + .createWithDefault(true) + val COLUMNAR_NATIVE_CAST_AGGREGATE_ENABLED = buildConf("spark.gluten.sql.columnar.cast.avg") .internal() diff --git a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala index d47dbc4cc1fa8..96a615615179c 100644 --- a/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala +++ b/shims/common/src/main/scala/org/apache/gluten/expression/ExpressionNames.scala @@ -132,6 +132,7 @@ object ExpressionNames { final val UNBASE64 = "unbase64" final val BASE64 = "base64" final val MASK = "mask" + final val FORMAT_STRING = "format_string" // URL functions final val PARSE_URL = "parse_url" diff --git a/shims/common/src/main/scala/org/apache/gluten/metrics/GlutenTimeMetric.scala b/shims/common/src/main/scala/org/apache/gluten/metrics/GlutenTimeMetric.scala index 37e824ea60e49..a2a187a4df315 100644 --- a/shims/common/src/main/scala/org/apache/gluten/metrics/GlutenTimeMetric.scala +++ b/shims/common/src/main/scala/org/apache/gluten/metrics/GlutenTimeMetric.scala @@ -44,4 +44,10 @@ object GlutenTimeMetric { } def withMillisTime[U](block: => U)(millisTime: Long => Unit): U = withNanoTime(block)(t => millisTime(TimeUnit.NANOSECONDS.toMillis(t))) + + def recordMillisTime[U](block: => U): (U, Long) = { + var time = 0L + val result = withMillisTime(block)(time = _) + (result, time) + } } diff --git a/shims/spark32/pom.xml b/shims/spark32/pom.xml index 802b758c8531a..7e9dcb226dbb5 100644 --- a/shims/spark32/pom.xml +++ b/shims/spark32/pom.xml @@ -43,13 +43,13 @@ org.apache.spark - spark-catalyst_2.12 + spark-catalyst_${scala.binary.version} provided true org.apache.spark - spark-core_2.12 + spark-core_${scala.binary.version} provided true diff --git a/shims/spark32/src/main/java/org/apache/spark/memory/MemoryConsumer.java b/shims/spark32/src/main/java/org/apache/spark/memory/MemoryConsumer.java new file mode 100644 index 0000000000000..bfe699c13a354 --- /dev/null +++ b/shims/spark32/src/main/java/org/apache/spark/memory/MemoryConsumer.java @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.memory;
+
+import org.apache.spark.unsafe.array.LongArray;
+import org.apache.spark.unsafe.memory.MemoryBlock;
+
+import java.io.IOException;
+
+/**
+ * A memory consumer of {@link TaskMemoryManager} that supports spilling.
+ *
+ * <p>
Note: this only supports allocation / spilling of Tungsten memory. + */ +public abstract class MemoryConsumer { + + protected final TaskMemoryManager taskMemoryManager; + private final long pageSize; + private final MemoryMode mode; + protected long used; + + protected MemoryConsumer(TaskMemoryManager taskMemoryManager, long pageSize, MemoryMode mode) { + this.taskMemoryManager = taskMemoryManager; + this.pageSize = pageSize; + this.mode = mode; + } + + protected MemoryConsumer(TaskMemoryManager taskMemoryManager, MemoryMode mode) { + this(taskMemoryManager, taskMemoryManager.pageSizeBytes(), mode); + } + + public long getTaskAttemptId() { + return this.taskMemoryManager.getTaskAttemptId(); + } + + /** Returns the memory mode, {@link MemoryMode#ON_HEAP} or {@link MemoryMode#OFF_HEAP}. */ + public MemoryMode getMode() { + return mode; + } + + /** Returns the size of used memory in bytes. */ + public long getUsed() { + return used; + } + + /** Force spill during building. */ + public void spill() throws IOException { + spill(Long.MAX_VALUE, this); + } + + /** + * Spill some data to disk to release memory, which will be called by TaskMemoryManager when there + * is not enough memory for the task. + * + *

<p>This should be implemented by subclass.
+ *
+ * <p>Note: In order to avoid possible deadlock, should not call acquireMemory() from spill().
+ *
+ * <p>
Note: today, this only frees Tungsten-managed pages. + * + * @param size the amount of memory should be released + * @param trigger the MemoryConsumer that trigger this spilling + * @return the amount of released memory in bytes + */ + public abstract long spill(long size, MemoryConsumer trigger) throws IOException; + + public long forceSpill(long size, MemoryConsumer trigger) throws IOException { + return 0; + } + + /** + * Allocates a LongArray of `size`. Note that this method may throw `SparkOutOfMemoryError` if + * Spark doesn't have enough memory for this allocation, or throw `TooLargePageException` if this + * `LongArray` is too large to fit in a single page. The caller side should take care of these two + * exceptions, or make sure the `size` is small enough that won't trigger exceptions. + * + * @throws SparkOutOfMemoryError + * @throws TooLargePageException + */ + public LongArray allocateArray(long size) { + long required = size * 8L; + MemoryBlock page = taskMemoryManager.allocatePage(required, this); + if (page == null || page.size() < required) { + throwOom(page, required); + } + used += required; + return new LongArray(page); + } + + /** Frees a LongArray. */ + public void freeArray(LongArray array) { + freePage(array.memoryBlock()); + } + + /** + * Allocate a memory block with at least `required` bytes. + * + * @throws SparkOutOfMemoryError + */ + protected MemoryBlock allocatePage(long required) { + MemoryBlock page = taskMemoryManager.allocatePage(Math.max(pageSize, required), this); + if (page == null || page.size() < required) { + throwOom(page, required); + } + used += page.size(); + return page; + } + + /** Free a memory block. */ + protected void freePage(MemoryBlock page) { + used -= page.size(); + taskMemoryManager.freePage(page, this); + } + + /** Allocates memory of `size`. */ + public long acquireMemory(long size) { + long granted = taskMemoryManager.acquireExecutionMemory(size, this); + used += granted; + return granted; + } + + /** Release N bytes of memory. */ + public void freeMemory(long size) { + taskMemoryManager.releaseExecutionMemory(size, this); + used -= size; + } + + private void throwOom(final MemoryBlock page, final long required) { + long got = 0; + if (page != null) { + got = page.size(); + taskMemoryManager.freePage(page, this); + } + taskMemoryManager.showMemoryUsage(); + // checkstyle.off: RegexpSinglelineJava + throw new SparkOutOfMemoryError( + "UNABLE_TO_ACQUIRE_MEMORY", new String[] {Long.toString(required), Long.toString(got)}); + // checkstyle.on: RegexpSinglelineJava + } +} diff --git a/shims/spark32/src/main/java/org/apache/spark/memory/TaskMemoryManager.java b/shims/spark32/src/main/java/org/apache/spark/memory/TaskMemoryManager.java new file mode 100644 index 0000000000000..57a92110c06a5 --- /dev/null +++ b/shims/spark32/src/main/java/org/apache/spark/memory/TaskMemoryManager.java @@ -0,0 +1,479 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.memory;
+
+import com.google.common.annotations.VisibleForTesting;
+import org.apache.spark.unsafe.memory.MemoryBlock;
+import org.apache.spark.util.Utils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.concurrent.GuardedBy;
+
+import java.io.IOException;
+import java.nio.channels.ClosedByInterruptException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * Manages the memory allocated by an individual task.
+ *
+ * <p>Most of the complexity in this class deals with encoding of off-heap addresses into 64-bit
+ * longs. In off-heap mode, memory can be directly addressed with 64-bit longs. In on-heap mode,
+ * memory is addressed by the combination of a base Object reference and a 64-bit offset within that
+ * object. This is a problem when we want to store pointers to data structures inside of other
+ * structures, such as record pointers inside hashmaps or sorting buffers. Even if we decided to use
+ * 128 bits to address memory, we can't just store the address of the base object since it's not
+ * guaranteed to remain stable as the heap gets reorganized due to GC.
+ *
+ * <p>Instead, we use the following approach to encode record pointers in 64-bit longs: for off-heap
+ * mode, just store the raw address, and for on-heap mode use the upper 13 bits of the address to
+ * store a "page number" and the lower 51 bits to store an offset within this page. These page
+ * numbers are used to index into a "page table" array inside of the MemoryManager in order to
+ * retrieve the base object.
+ *
+ * <p>
This allows us to address 8192 pages. In on-heap mode, the maximum page size is limited by the + * maximum size of a long[] array, allowing us to address 8192 * (2^31 - 1) * 8 bytes, which is + * approximately 140 terabytes of memory. + */ +public class TaskMemoryManager { + + private static final Logger logger = LoggerFactory.getLogger(TaskMemoryManager.class); + + /** The number of bits used to address the page table. */ + private static final int PAGE_NUMBER_BITS = 13; + + /** The number of bits used to encode offsets in data pages. */ + @VisibleForTesting static final int OFFSET_BITS = 64 - PAGE_NUMBER_BITS; // 51 + + /** The number of entries in the page table. */ + private static final int PAGE_TABLE_SIZE = 1 << PAGE_NUMBER_BITS; + + /** + * Maximum supported data page size (in bytes). In principle, the maximum addressable page size is + * (1L << OFFSET_BITS) bytes, which is 2+ petabytes. However, the on-heap allocator's + * maximum page size is limited by the maximum amount of data that can be stored in a long[] + * array, which is (2^31 - 1) * 8 bytes (or about 17 gigabytes). Therefore, we cap this at 17 + * gigabytes. + */ + public static final long MAXIMUM_PAGE_SIZE_BYTES = ((1L << 31) - 1) * 8L; + + /** Bit mask for the lower 51 bits of a long. */ + private static final long MASK_LONG_LOWER_51_BITS = 0x7FFFFFFFFFFFFL; + + /** + * Similar to an operating system's page table, this array maps page numbers into base object + * pointers, allowing us to translate between the hashtable's internal 64-bit address + * representation and the baseObject+offset representation which we use to support both on- and + * off-heap addresses. When using an off-heap allocator, every entry in this map will be `null`. + * When using an on-heap allocator, the entries in this map will point to pages' base objects. + * Entries are added to this map as new data pages are allocated. + */ + private final MemoryBlock[] pageTable = new MemoryBlock[PAGE_TABLE_SIZE]; + + /** Bitmap for tracking free pages. */ + private final BitSet allocatedPages = new BitSet(PAGE_TABLE_SIZE); + + private final MemoryManager memoryManager; + + private final long taskAttemptId; + + /** + * Tracks whether we're on-heap or off-heap. For off-heap, we short-circuit most of these methods + * without doing any masking or lookups. Since this branching should be well-predicted by the JIT, + * this extra layer of indirection / abstraction hopefully shouldn't be too expensive. + */ + final MemoryMode tungstenMemoryMode; + + /** Tracks spillable memory consumers. */ + @GuardedBy("this") + private final HashSet consumers; + + /** The amount of memory that is acquired but not used. */ + private volatile long acquiredButNotUsed = 0L; + + /** Construct a new TaskMemoryManager. */ + public TaskMemoryManager(MemoryManager memoryManager, long taskAttemptId) { + this.tungstenMemoryMode = memoryManager.tungstenMemoryMode(); + this.memoryManager = memoryManager; + this.taskAttemptId = taskAttemptId; + this.consumers = new HashSet<>(); + } + + public long getTaskAttemptId() { + return taskAttemptId; + } + + public long acquireExecutionMemory(long required, MemoryConsumer consumer) { + long got = acquireExecutionMemory(required, consumer, false); + if (got < required) { + got += acquireExecutionMemory(required, consumer, true); + } + return got; + } + + /** + * Acquire N bytes of memory for a consumer. If there is no enough memory, it will call spill() of + * consumers to release more memory. 
+ * + * @return number of bytes successfully granted (<= N). + */ + public long acquireExecutionMemory(long required, MemoryConsumer consumer, boolean force) { + assert (required >= 0); + assert (consumer != null); + MemoryMode mode = consumer.getMode(); + // If we are allocating Tungsten pages off-heap and receive a request to allocate on-heap + // memory here, then it may not make sense to spill since that would only end up freeing + // off-heap memory. This is subject to change, though, so it may be risky to make this + // optimization now in case we forget to undo it late when making changes. + synchronized (this) { + long got = memoryManager.acquireExecutionMemory(required, taskAttemptId, mode); + + // Try to release memory from other consumers first, then we can reduce the frequency of + // spilling, avoid to have too many spilled files. + if (got < required) { + // Call spill() on other consumers to release memory + // Sort the consumers according their memory usage. So we avoid spilling the same consumer + // which is just spilled in last few times and re-spilling on it will produce many small + // spill files. + TreeMap> sortedConsumers = new TreeMap<>(); + for (MemoryConsumer c : consumers) { + if (c != consumer && c.getUsed() > 0 && c.getMode() == mode) { + long key = c.getUsed(); + List list = + sortedConsumers.computeIfAbsent(key, k -> new ArrayList<>(1)); + list.add(c); + } + } + while (!sortedConsumers.isEmpty()) { + // Get the consumer using the least memory more than the remaining required memory. + Map.Entry> currentEntry = + sortedConsumers.ceilingEntry(required - got); + // No consumer has used memory more than the remaining required memory. + // Get the consumer of largest used memory. + if (currentEntry == null) { + currentEntry = sortedConsumers.lastEntry(); + } + List cList = currentEntry.getValue(); + MemoryConsumer c = cList.get(cList.size() - 1); + try { + long released = + force ? c.forceSpill(required - got, consumer) : c.spill(required - got, consumer); + if (released > 0) { + logger.debug( + "Task {} released {} from {} for {}", + taskAttemptId, + Utils.bytesToString(released), + c, + consumer); + got += memoryManager.acquireExecutionMemory(required - got, taskAttemptId, mode); + if (got >= required) { + break; + } + } else { + cList.remove(cList.size() - 1); + if (cList.isEmpty()) { + sortedConsumers.remove(currentEntry.getKey()); + } + } + } catch (ClosedByInterruptException e) { + // This called by user to kill a task (e.g: speculative task). + logger.error("error while calling spill() on " + c, e); + throw new RuntimeException(e.getMessage()); + } catch (IOException e) { + logger.error("error while calling spill() on " + c, e); + // checkstyle.off: RegexpSinglelineJava + throw new SparkOutOfMemoryError( + "error while calling spill() on " + c + " : " + e.getMessage()); + // checkstyle.on: RegexpSinglelineJava + } + } + } + + // Attempt to free up memory by self-spilling. + // + // When our spill handler releases memory, `ExecutionMemoryPool#releaseMemory()` will + // immediately notify other tasks that memory has been freed, and they may acquire the + // newly-freed memory before we have a chance to do so (SPARK-35486). In that case, we will + // try again in the next loop iteration. + while (got < required) { + try { + long released = + force + ? 
consumer.forceSpill(required - got, consumer) + : consumer.spill(required - got, consumer); + if (released > 0) { + logger.debug( + "Task {} released {} from itself ({})", + taskAttemptId, + Utils.bytesToString(released), + consumer); + got += memoryManager.acquireExecutionMemory(required - got, taskAttemptId, mode); + } else { + // Self-spilling could not free up any more memory. + break; + } + } catch (ClosedByInterruptException e) { + // This called by user to kill a task (e.g: speculative task). + logger.error("error while calling spill() on " + consumer, e); + throw new RuntimeException(e.getMessage()); + } catch (IOException e) { + logger.error("error while calling spill() on " + consumer, e); + // checkstyle.off: RegexpSinglelineJava + throw new SparkOutOfMemoryError( + "error while calling spill() on " + consumer + " : " + e.getMessage()); + // checkstyle.on: RegexpSinglelineJava + } + } + + consumers.add(consumer); + logger.debug("Task {} acquired {} for {}", taskAttemptId, Utils.bytesToString(got), consumer); + return got; + } + } + + /** Release N bytes of execution memory for a MemoryConsumer. */ + public void releaseExecutionMemory(long size, MemoryConsumer consumer) { + logger.debug("Task {} release {} from {}", taskAttemptId, Utils.bytesToString(size), consumer); + memoryManager.releaseExecutionMemory(size, taskAttemptId, consumer.getMode()); + } + + /** Dump the memory usage of all consumers. */ + public void showMemoryUsage() { + logger.info("Memory used in task " + taskAttemptId); + synchronized (this) { + long memoryAccountedForByConsumers = 0; + for (MemoryConsumer c : consumers) { + long totalMemUsage = c.getUsed(); + memoryAccountedForByConsumers += totalMemUsage; + if (totalMemUsage > 0) { + logger.info("Acquired by " + c + ": " + Utils.bytesToString(totalMemUsage)); + } + } + long memoryNotAccountedFor = + memoryManager.getExecutionMemoryUsageForTask(taskAttemptId) + - memoryAccountedForByConsumers; + logger.info( + "{} bytes of memory were used by task {} but are not associated with specific consumers", + memoryNotAccountedFor, + taskAttemptId); + logger.info( + "{} bytes of memory are used for execution and {} bytes of memory are used for storage", + memoryManager.executionMemoryUsed(), + memoryManager.storageMemoryUsed()); + } + } + + /** Return the page size in bytes. */ + public long pageSizeBytes() { + return memoryManager.pageSizeBytes(); + } + + /** + * Allocate a block of memory that will be tracked in the MemoryManager's page table; this is + * intended for allocating large blocks of Tungsten memory that will be shared between operators. + * + *
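The victim-selection policy in acquireExecutionMemory above, which spills the consumer using the least memory that still covers the remaining need and falls back to the largest consumer otherwise, can be exercised in isolation. A sketch with made-up consumer names and sizes:

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

// Illustrative sketch of the TreeMap-based victim selection: ceilingEntry() finds the
// smallest consumer whose usage still covers the remaining need; lastEntry() is the
// fallback when no single consumer is large enough.
public class SpillVictimSketch {
  public static void main(String[] args) {
    TreeMap<Long, List<String>> sortedConsumers = new TreeMap<>();
    sortedConsumers.computeIfAbsent(64L, k -> new ArrayList<>()).add("sorter-A");
    sortedConsumers.computeIfAbsent(256L, k -> new ArrayList<>()).add("hash-map-B");
    sortedConsumers.computeIfAbsent(1024L, k -> new ArrayList<>()).add("sorter-C");

    long remaining = 200L;
    Map.Entry<Long, List<String>> entry = sortedConsumers.ceilingEntry(remaining);
    if (entry == null) {
      entry = sortedConsumers.lastEntry();
    }
    List<String> candidates = entry.getValue();
    // Prints "hash-map-B": the smallest consumer whose usage (256) covers the remaining 200.
    System.out.println(candidates.get(candidates.size() - 1));
  }
}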

Returns `null` if there was not enough memory to allocate the page. May return a page that + * contains fewer bytes than requested, so callers should verify the size of returned pages. + * + * @throws TooLargePageException + */ + public MemoryBlock allocatePage(long size, MemoryConsumer consumer) { + assert (consumer != null); + assert (consumer.getMode() == tungstenMemoryMode); + if (size > MAXIMUM_PAGE_SIZE_BYTES) { + throw new TooLargePageException(size); + } + + long acquired = acquireExecutionMemory(size, consumer); + if (acquired <= 0) { + return null; + } + + final int pageNumber; + synchronized (this) { + pageNumber = allocatedPages.nextClearBit(0); + if (pageNumber >= PAGE_TABLE_SIZE) { + releaseExecutionMemory(acquired, consumer); + throw new IllegalStateException( + "Have already allocated a maximum of " + PAGE_TABLE_SIZE + " pages"); + } + allocatedPages.set(pageNumber); + } + MemoryBlock page = null; + try { + page = memoryManager.tungstenMemoryAllocator().allocate(acquired); + } catch (OutOfMemoryError e) { + logger.warn("Failed to allocate a page ({} bytes), try again.", acquired); + // there is no enough memory actually, it means the actual free memory is smaller than + // MemoryManager thought, we should keep the acquired memory. + synchronized (this) { + acquiredButNotUsed += acquired; + allocatedPages.clear(pageNumber); + } + // this could trigger spilling to free some pages. + return allocatePage(size, consumer); + } + page.pageNumber = pageNumber; + pageTable[pageNumber] = page; + if (logger.isTraceEnabled()) { + logger.trace("Allocate page number {} ({} bytes)", pageNumber, acquired); + } + return page; + } + + /** Free a block of memory allocated via {@link TaskMemoryManager#allocatePage}. */ + public void freePage(MemoryBlock page, MemoryConsumer consumer) { + assert (page.pageNumber != MemoryBlock.NO_PAGE_NUMBER) + : "Called freePage() on memory that wasn't allocated with allocatePage()"; + assert (page.pageNumber != MemoryBlock.FREED_IN_ALLOCATOR_PAGE_NUMBER) + : "Called freePage() on a memory block that has already been freed"; + assert (page.pageNumber != MemoryBlock.FREED_IN_TMM_PAGE_NUMBER) + : "Called freePage() on a memory block that has already been freed"; + assert (allocatedPages.get(page.pageNumber)); + pageTable[page.pageNumber] = null; + synchronized (this) { + allocatedPages.clear(page.pageNumber); + } + if (logger.isTraceEnabled()) { + logger.trace("Freed page number {} ({} bytes)", page.pageNumber, page.size()); + } + long pageSize = page.size(); + // Clear the page number before passing the block to the MemoryAllocator's free(). + // Doing this allows the MemoryAllocator to detect when a TaskMemoryManager-managed + // page has been inappropriately directly freed without calling TMM.freePage(). + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; + memoryManager.tungstenMemoryAllocator().free(page); + releaseExecutionMemory(pageSize, consumer); + } + + /** + * Given a memory page and offset within that page, encode this address into a 64-bit long. This + * address will remain valid as long as the corresponding page has not been freed. + * + * @param page a data page allocated by {@link TaskMemoryManager#allocatePage}/ + * @param offsetInPage an offset in this page which incorporates the base offset. In other words, + * this should be the value that you would pass as the base offset into an UNSAFE call (e.g. + * page.baseOffset() + something). + * @return an encoded page address. 
+ */ + public long encodePageNumberAndOffset(MemoryBlock page, long offsetInPage) { + if (tungstenMemoryMode == MemoryMode.OFF_HEAP) { + // In off-heap mode, an offset is an absolute address that may require a full 64 bits to + // encode. Due to our page size limitation, though, we can convert this into an offset that's + // relative to the page's base offset; this relative offset will fit in 51 bits. + offsetInPage -= page.getBaseOffset(); + } + return encodePageNumberAndOffset(page.pageNumber, offsetInPage); + } + + @VisibleForTesting + public static long encodePageNumberAndOffset(int pageNumber, long offsetInPage) { + assert (pageNumber >= 0) : "encodePageNumberAndOffset called with invalid page"; + return (((long) pageNumber) << OFFSET_BITS) | (offsetInPage & MASK_LONG_LOWER_51_BITS); + } + + @VisibleForTesting + public static int decodePageNumber(long pagePlusOffsetAddress) { + return (int) (pagePlusOffsetAddress >>> OFFSET_BITS); + } + + private static long decodeOffset(long pagePlusOffsetAddress) { + return (pagePlusOffsetAddress & MASK_LONG_LOWER_51_BITS); + } + + /** + * Get the page associated with an address encoded by {@link + * TaskMemoryManager#encodePageNumberAndOffset(MemoryBlock, long)} + */ + public Object getPage(long pagePlusOffsetAddress) { + if (tungstenMemoryMode == MemoryMode.ON_HEAP) { + final int pageNumber = decodePageNumber(pagePlusOffsetAddress); + assert (pageNumber >= 0 && pageNumber < PAGE_TABLE_SIZE); + final MemoryBlock page = pageTable[pageNumber]; + assert (page != null); + assert (page.getBaseObject() != null); + return page.getBaseObject(); + } else { + return null; + } + } + + /** + * Get the offset associated with an address encoded by {@link + * TaskMemoryManager#encodePageNumberAndOffset(MemoryBlock, long)} + */ + public long getOffsetInPage(long pagePlusOffsetAddress) { + final long offsetInPage = decodeOffset(pagePlusOffsetAddress); + if (tungstenMemoryMode == MemoryMode.ON_HEAP) { + return offsetInPage; + } else { + // In off-heap mode, an offset is an absolute address. In encodePageNumberAndOffset, we + // converted the absolute address into a relative address. Here, we invert that operation: + final int pageNumber = decodePageNumber(pagePlusOffsetAddress); + assert (pageNumber >= 0 && pageNumber < PAGE_TABLE_SIZE); + final MemoryBlock page = pageTable[pageNumber]; + assert (page != null); + return page.getBaseOffset() + offsetInPage; + } + } + + /** + * Clean up all allocated memory and pages. Returns the number of bytes freed. A non-zero return + * value can be used to detect memory leaks. + */ + public long cleanUpAllAllocatedMemory() { + synchronized (this) { + for (MemoryConsumer c : consumers) { + if (c != null && c.getUsed() > 0) { + // In case of failed task, it's normal to see leaked memory + logger.debug("unreleased " + Utils.bytesToString(c.getUsed()) + " memory from " + c); + } + } + consumers.clear(); + + for (MemoryBlock page : pageTable) { + if (page != null) { + logger.debug("unreleased page: " + page + " in task " + taskAttemptId); + page.pageNumber = MemoryBlock.FREED_IN_TMM_PAGE_NUMBER; + memoryManager.tungstenMemoryAllocator().free(page); + } + } + Arrays.fill(pageTable, null); + } + + // release the memory that is not used by any consumer (acquired for pages in tungsten mode). + memoryManager.releaseExecutionMemory(acquiredButNotUsed, taskAttemptId, tungstenMemoryMode); + + return memoryManager.releaseAllExecutionMemoryForTask(taskAttemptId); + } + + /** Returns the memory consumption, in bytes, for the current task. 
*/ + public long getMemoryConsumptionForThisTask() { + return memoryManager.getExecutionMemoryUsageForTask(taskAttemptId); + } + + /** Returns Tungsten memory mode */ + public MemoryMode getTungstenMemoryMode() { + return tungstenMemoryMode; + } +} diff --git a/shims/spark32/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/shims/spark32/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java new file mode 100644 index 0000000000000..0f3bb8cc7d2cc --- /dev/null +++ b/shims/spark32/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -0,0 +1,914 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.util.collection.unsafe.sort; + +import com.google.common.annotations.VisibleForTesting; +import org.apache.spark.TaskContext; +import org.apache.spark.executor.ShuffleWriteMetrics; +import org.apache.spark.memory.MemoryConsumer; +import org.apache.spark.memory.SparkOutOfMemoryError; +import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.memory.TooLargePageException; +import org.apache.spark.serializer.SerializerManager; +import org.apache.spark.storage.BlockManager; +import org.apache.spark.unsafe.Platform; +import org.apache.spark.unsafe.UnsafeAlignedOffset; +import org.apache.spark.unsafe.array.LongArray; +import org.apache.spark.unsafe.memory.MemoryBlock; +import org.apache.spark.util.Utils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import javax.annotation.Nullable; + +import java.io.File; +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; +import java.util.Queue; +import java.util.function.Supplier; + +/** External sorter based on {@link UnsafeInMemorySorter}. */ +public final class UnsafeExternalSorter extends MemoryConsumer { + + private static final Logger logger = LoggerFactory.getLogger(UnsafeExternalSorter.class); + + @Nullable private final PrefixComparator prefixComparator; + + /** + * {@link RecordComparator} may probably keep the reference to the records they compared last + * time, so we should not keep a {@link RecordComparator} instance inside {@link + * UnsafeExternalSorter}, because {@link UnsafeExternalSorter} is referenced by {@link + * TaskContext} and thus can not be garbage collected until the end of the task. 
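A minimal sketch of the design choice in the comment above: the long-lived sorter keeps only a factory, so each comparator it creates stays short-lived and can be collected along with any records it referenced. The class names below are invented for the example.

import java.util.function.Supplier;

// Illustrative only: a long-lived owner that stores a Supplier rather than the comparator
// itself, so the comparator (and whatever records it last touched) does not outlive one sort.
public class SupplierSketch {
  static class ComparatorLike { /* may retain references to recently compared records */ }

  private final Supplier<ComparatorLike> comparatorSupplier;

  SupplierSketch(Supplier<ComparatorLike> comparatorSupplier) {
    this.comparatorSupplier = comparatorSupplier;
  }

  void sortOnce() {
    ComparatorLike comparator = comparatorSupplier.get(); // created on demand
    // ... use comparator for this sort only; it becomes unreachable afterwards ...
  }

  public static void main(String[] args) {
    new SupplierSketch(ComparatorLike::new).sortOnce();
  }
}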
+ */ + @Nullable private final Supplier recordComparatorSupplier; + + private final TaskMemoryManager taskMemoryManager; + private final BlockManager blockManager; + private final SerializerManager serializerManager; + private final TaskContext taskContext; + + /** The buffer size to use when writing spills using DiskBlockObjectWriter */ + private final int fileBufferSizeBytes; + + /** Force this sorter to spill when there are this many elements in memory. */ + private final int numElementsForSpillThreshold; + + /** + * Memory pages that hold the records being sorted. The pages in this list are freed when + * spilling, although in principle we could recycle these pages across spills (on the other hand, + * this might not be necessary if we maintained a pool of re-usable pages in the TaskMemoryManager + * itself). + */ + private final LinkedList allocatedPages = new LinkedList<>(); + + private final LinkedList spillWriters = new LinkedList<>(); + + // These variables are reset after spilling: + @Nullable private volatile UnsafeInMemorySorter inMemSorter; + + private MemoryBlock currentPage = null; + private long pageCursor = -1; + private long peakMemoryUsedBytes = 0; + private long totalSpillBytes = 0L; + private long totalSortTimeNanos = 0L; + private volatile SpillableIterator readingIterator = null; + + public static UnsafeExternalSorter createWithExistingInMemorySorter( + TaskMemoryManager taskMemoryManager, + BlockManager blockManager, + SerializerManager serializerManager, + TaskContext taskContext, + Supplier recordComparatorSupplier, + PrefixComparator prefixComparator, + int initialSize, + long pageSizeBytes, + int numElementsForSpillThreshold, + UnsafeInMemorySorter inMemorySorter, + long existingMemoryConsumption) + throws IOException { + UnsafeExternalSorter sorter = + new UnsafeExternalSorter( + taskMemoryManager, + blockManager, + serializerManager, + taskContext, + recordComparatorSupplier, + prefixComparator, + initialSize, + pageSizeBytes, + numElementsForSpillThreshold, + inMemorySorter, + false /* ignored */); + sorter.spill(Long.MAX_VALUE, sorter); + taskContext.taskMetrics().incMemoryBytesSpilled(existingMemoryConsumption); + sorter.totalSpillBytes += existingMemoryConsumption; + // The external sorter will be used to insert records, in-memory sorter is not needed. 
+ sorter.inMemSorter = null; + return sorter; + } + + public static UnsafeExternalSorter create( + TaskMemoryManager taskMemoryManager, + BlockManager blockManager, + SerializerManager serializerManager, + TaskContext taskContext, + Supplier recordComparatorSupplier, + PrefixComparator prefixComparator, + int initialSize, + long pageSizeBytes, + int numElementsForSpillThreshold, + boolean canUseRadixSort) { + return new UnsafeExternalSorter( + taskMemoryManager, + blockManager, + serializerManager, + taskContext, + recordComparatorSupplier, + prefixComparator, + initialSize, + pageSizeBytes, + numElementsForSpillThreshold, + null, + canUseRadixSort); + } + + private UnsafeExternalSorter( + TaskMemoryManager taskMemoryManager, + BlockManager blockManager, + SerializerManager serializerManager, + TaskContext taskContext, + Supplier recordComparatorSupplier, + PrefixComparator prefixComparator, + int initialSize, + long pageSizeBytes, + int numElementsForSpillThreshold, + @Nullable UnsafeInMemorySorter existingInMemorySorter, + boolean canUseRadixSort) { + super(taskMemoryManager, pageSizeBytes, taskMemoryManager.getTungstenMemoryMode()); + this.taskMemoryManager = taskMemoryManager; + this.blockManager = blockManager; + this.serializerManager = serializerManager; + this.taskContext = taskContext; + this.recordComparatorSupplier = recordComparatorSupplier; + this.prefixComparator = prefixComparator; + // Use getSizeAsKb (not bytes) to maintain backwards compatibility for units + // this.fileBufferSizeBytes = (int) conf.getSizeAsKb("spark.shuffle.file.buffer", "32k") * 1024 + this.fileBufferSizeBytes = 32 * 1024; + + if (existingInMemorySorter == null) { + RecordComparator comparator = null; + if (recordComparatorSupplier != null) { + comparator = recordComparatorSupplier.get(); + } + this.inMemSorter = + new UnsafeInMemorySorter( + this, taskMemoryManager, comparator, prefixComparator, initialSize, canUseRadixSort); + } else { + this.inMemSorter = existingInMemorySorter; + } + this.peakMemoryUsedBytes = getMemoryUsage(); + this.numElementsForSpillThreshold = numElementsForSpillThreshold; + + // Register a cleanup task with TaskContext to ensure that memory is guaranteed to be freed at + // the end of the task. This is necessary to avoid memory leaks in when the downstream operator + // does not fully consume the sorter's output (e.g. sort followed by limit). + taskContext.addTaskCompletionListener( + context -> { + cleanupResources(); + }); + } + + /** + * Marks the current page as no-more-space-available, and as a result, either allocate a new page + * or spill when we see the next record. + */ + @VisibleForTesting + public void closeCurrentPage() { + if (currentPage != null) { + pageCursor = currentPage.getBaseOffset() + currentPage.size(); + } + } + + @Override + public long forceSpill(long size, MemoryConsumer trigger) throws IOException { + if (trigger != this && readingIterator != null) { + return readingIterator.spill(); + } + if (getTaskAttemptId() != trigger.getTaskAttemptId()) { + return 0; // fail + } + + if (inMemSorter == null || inMemSorter.numRecords() <= 0) { + // There could still be some memory allocated when there are no records in the in-memory + // sorter. We will not spill it however, to ensure that we can always process at least one + // record before spilling. See the comments in `allocateMemoryForRecordIfNecessary` for why + // this is necessary. 
+ return 0L; + } + + logger.info( + "Thread {} force spilling sort data of {} to disk ({} {} so far)", + Thread.currentThread().getId(), + Utils.bytesToString(getMemoryUsage()), + spillWriters.size(), + spillWriters.size() > 1 ? " times" : " time"); + + ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); + + final UnsafeSorterSpillWriter spillWriter = + new UnsafeSorterSpillWriter( + blockManager, fileBufferSizeBytes, writeMetrics, inMemSorter.numRecords()); + spillWriters.add(spillWriter); + spillIterator(inMemSorter.getSortedIterator(), spillWriter); + + final long spillSize = freeMemory(); + // Note that this is more-or-less going to be a multiple of the page size, so wasted space in + // pages will currently be counted as memory spilled even though that space isn't actually + // written to disk. This also counts the space needed to store the sorter's pointer array. + inMemSorter.freeMemory(); + // Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the + // records. Otherwise, if the task is over allocated memory, then without freeing the memory + // pages, we might not be able to get memory for the pointer array. + + taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); + taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten()); + totalSpillBytes += spillSize; + return spillSize; + } + + /** Sort and spill the current records in response to memory pressure. */ + @Override + public long spill(long size, MemoryConsumer trigger) throws IOException { + if (trigger != this) { + if (readingIterator != null) { + return readingIterator.spill(); + } + return 0L; // this should throw exception + } + + if (inMemSorter == null || inMemSorter.numRecords() <= 0) { + // There could still be some memory allocated when there are no records in the in-memory + // sorter. We will not spill it however, to ensure that we can always process at least one + // record before spilling. See the comments in `allocateMemoryForRecordIfNecessary` for why + // this is necessary. + return 0L; + } + + logger.info( + "Thread {} spilling sort data of {} to disk ({} {} so far)", + Thread.currentThread().getId(), + Utils.bytesToString(getMemoryUsage()), + spillWriters.size(), + spillWriters.size() > 1 ? " times" : " time"); + + ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); + + final UnsafeSorterSpillWriter spillWriter = + new UnsafeSorterSpillWriter( + blockManager, fileBufferSizeBytes, writeMetrics, inMemSorter.numRecords()); + spillWriters.add(spillWriter); + spillIterator(inMemSorter.getSortedIterator(), spillWriter); + + final long spillSize = freeMemory(); + // Note that this is more-or-less going to be a multiple of the page size, so wasted space in + // pages will currently be counted as memory spilled even though that space isn't actually + // written to disk. This also counts the space needed to store the sorter's pointer array. + inMemSorter.freeMemory(); + // Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the + // records. Otherwise, if the task is over allocated memory, then without freeing the memory + // pages, we might not be able to get memory for the pointer array. + + taskContext.taskMetrics().incMemoryBytesSpilled(spillSize); + taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten()); + totalSpillBytes += spillSize; + return spillSize; + } + + /** + * Return the total memory usage of this sorter, including the data pages and the sorter's pointer + * array. 
+ */ + private long getMemoryUsage() { + long totalPageSize = 0; + for (MemoryBlock page : allocatedPages) { + totalPageSize += page.size(); + } + return ((inMemSorter == null) ? 0 : inMemSorter.getMemoryUsage()) + totalPageSize; + } + + private void updatePeakMemoryUsed() { + long mem = getMemoryUsage(); + if (mem > peakMemoryUsedBytes) { + peakMemoryUsedBytes = mem; + } + } + + /** Return the peak memory used so far, in bytes. */ + public long getPeakMemoryUsedBytes() { + updatePeakMemoryUsed(); + return peakMemoryUsedBytes; + } + + /** @return the total amount of time spent sorting data (in-memory only). */ + public long getSortTimeNanos() { + UnsafeInMemorySorter sorter = inMemSorter; + if (sorter != null) { + return sorter.getSortTimeNanos(); + } + return totalSortTimeNanos; + } + + /** Return the total number of bytes that has been spilled into disk so far. */ + public long getSpillSize() { + return totalSpillBytes; + } + + @VisibleForTesting + public int getNumberOfAllocatedPages() { + return allocatedPages.size(); + } + + /** + * Free this sorter's data pages. + * + * @return the number of bytes freed. + */ + private long freeMemory() { + List pagesToFree = clearAndGetAllocatedPagesToFree(); + long memoryFreed = 0; + for (MemoryBlock block : pagesToFree) { + memoryFreed += block.size(); + freePage(block); + } + return memoryFreed; + } + + /** + * Clear the allocated pages and return the list of allocated pages to let the caller free the + * page. This is to prevent the deadlock by nested locks if the caller locks the + * UnsafeExternalSorter and call freePage which locks the TaskMemoryManager and cause nested + * locks. + * + * @return list of allocated pages to free + */ + private List clearAndGetAllocatedPagesToFree() { + updatePeakMemoryUsed(); + List pagesToFree = new LinkedList<>(allocatedPages); + allocatedPages.clear(); + currentPage = null; + pageCursor = 0; + return pagesToFree; + } + + /** Deletes any spill files created by this sorter. */ + private void deleteSpillFiles() { + for (UnsafeSorterSpillWriter spill : spillWriters) { + File file = spill.getFile(); + if (file != null && file.exists()) { + if (!file.delete()) { + logger.error("Was unable to delete spill file {}", file.getAbsolutePath()); + } + } + } + } + + /** Frees this sorter's in-memory data structures and cleans up its spill files. */ + public void cleanupResources() { + // To avoid deadlocks, we can't call methods that lock the TaskMemoryManager + // (such as various free() methods) while synchronizing on the UnsafeExternalSorter. + // Instead, we will manipulate UnsafeExternalSorter state inside the synchronized + // lock and perform the actual free() calls outside it. + UnsafeInMemorySorter inMemSorterToFree = null; + List pagesToFree = null; + try { + synchronized (this) { + deleteSpillFiles(); + pagesToFree = clearAndGetAllocatedPagesToFree(); + if (inMemSorter != null) { + inMemSorterToFree = inMemSorter; + inMemSorter = null; + } + } + } finally { + for (MemoryBlock pageToFree : pagesToFree) { + freePage(pageToFree); + } + if (inMemSorterToFree != null) { + inMemSorterToFree.freeMemory(); + } + } + } + + /** + * Checks whether there is enough space to insert an additional record in to the sort pointer + * array and grows the array if additional space is required. If the required space cannot be + * obtained, then the in-memory data will be spilled to disk. 
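cleanupResources() above collects state under the sorter's own lock and performs the actual free() calls outside it, so the sorter's monitor and the TaskMemoryManager's monitor are never held in a nested fashion. A sketch of that pattern with placeholder types and locks:

import java.util.ArrayList;
import java.util.List;

// Illustrative only: detach resources while holding this object's lock, release them after
// the lock is dropped, so a second lock taken during release is never nested inside it.
public class FreeOutsideLockSketch {
  private final Object allocatorLock = new Object(); // stands in for the TaskMemoryManager lock
  private final List<long[]> allocatedPages = new ArrayList<>();

  void cleanup() {
    List<long[]> pagesToFree;
    synchronized (this) {
      pagesToFree = new ArrayList<>(allocatedPages);
      allocatedPages.clear();
    }
    for (long[] page : pagesToFree) {
      synchronized (allocatorLock) {
        // return `page` to the allocator (elided); only one lock is held here
      }
    }
  }
}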
+ */ + private void growPointerArrayIfNecessary() throws IOException { + assert (inMemSorter != null); + if (!inMemSorter.hasSpaceForAnotherRecord()) { + if (inMemSorter.numRecords() <= 0) { + // Spilling was triggered just before this method was called. The pointer array was freed + // during the spill, so a new pointer array needs to be allocated here. + LongArray array = allocateArray(inMemSorter.getInitialSize()); + inMemSorter.expandPointerArray(array); + return; + } + + long used = inMemSorter.getMemoryUsage(); + LongArray array = null; + try { + // could trigger spilling + array = allocateArray(used / 8 * 2); + } catch (TooLargePageException e) { + // The pointer array is too big to fix in a single page, spill. + spill(); + } catch (SparkOutOfMemoryError e) { + if (inMemSorter.numRecords() > 0) { + logger.error("Unable to grow the pointer array"); + throw e; + } + // The new array could not be allocated, but that is not an issue as it is longer needed, + // as all records were spilled. + } + + if (inMemSorter.numRecords() <= 0) { + // Spilling was triggered while trying to allocate the new array. + if (array != null) { + // We succeeded in allocating the new array, but, since all records were spilled, a + // smaller array would also suffice. + freeArray(array); + } + // The pointer array was freed during the spill, so a new pointer array needs to be + // allocated here. + array = allocateArray(inMemSorter.getInitialSize()); + } + inMemSorter.expandPointerArray(array); + } + } + + /** + * Allocates an additional page in order to insert an additional record. This will request + * additional memory from the memory manager and spill if the requested memory can not be + * obtained. + * + * @param required the required space in the data page, in bytes, including space for storing the + * record size. + */ + private void acquireNewPageIfNecessary(int required) { + if (currentPage == null + || pageCursor + required > currentPage.getBaseOffset() + currentPage.size()) { + // TODO: try to find space on previous pages + currentPage = allocatePage(required); + pageCursor = currentPage.getBaseOffset(); + allocatedPages.add(currentPage); + } + } + + /** + * Allocates more memory in order to insert an additional record. This will request additional + * memory from the memory manager and spill if the requested memory can not be obtained. + * + * @param required the required space in the data page, in bytes, including space for storing the + * record size. + */ + private void allocateMemoryForRecordIfNecessary(int required) throws IOException { + // Step 1: + // Ensure that the pointer array has space for another record. This may cause a spill. + growPointerArrayIfNecessary(); + // Step 2: + // Ensure that the last page has space for another record. This may cause a spill. + acquireNewPageIfNecessary(required); + // Step 3: + // The allocation in step 2 could have caused a spill, which would have freed the pointer + // array allocated in step 1. Therefore we need to check again whether we have to allocate + // a new pointer array. + // + // If the allocation in this step causes a spill event then it will not cause the page + // allocated in the previous step to be freed. The function `spill` only frees memory if at + // least one record has been inserted in the in-memory sorter. This will not be the case if + // we have spilled in the previous step. 
+ // + // If we did not spill in the previous step then `growPointerArrayIfNecessary` will be a + // no-op that does not allocate any memory, and therefore can't cause a spill event. + // + // Thus there is no need to call `acquireNewPageIfNecessary` again after this step. + growPointerArrayIfNecessary(); + } + + /** Write a record to the sorter. */ + public void insertRecord( + Object recordBase, long recordOffset, int length, long prefix, boolean prefixIsNull) + throws IOException { + + assert (inMemSorter != null); + if (inMemSorter.numRecords() >= numElementsForSpillThreshold) { + logger.info( + "Spilling data because number of spilledRecords crossed the threshold " + + numElementsForSpillThreshold); + spill(); + } + + final int uaoSize = UnsafeAlignedOffset.getUaoSize(); + // Need 4 or 8 bytes to store the record length. + final int required = length + uaoSize; + allocateMemoryForRecordIfNecessary(required); + + final Object base = currentPage.getBaseObject(); + final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor); + UnsafeAlignedOffset.putSize(base, pageCursor, length); + pageCursor += uaoSize; + Platform.copyMemory(recordBase, recordOffset, base, pageCursor, length); + pageCursor += length; + inMemSorter.insertRecord(recordAddress, prefix, prefixIsNull); + } + + /** + * Write a key-value record to the sorter. The key and value will be put together in-memory, using + * the following format: + * + *

record length (4 bytes), key length (4 bytes), key data, value data + * + *

record length = key length + value length + 4 + */ + public void insertKVRecord( + Object keyBase, + long keyOffset, + int keyLen, + Object valueBase, + long valueOffset, + int valueLen, + long prefix, + boolean prefixIsNull) + throws IOException { + + final int uaoSize = UnsafeAlignedOffset.getUaoSize(); + final int required = keyLen + valueLen + (2 * uaoSize); + allocateMemoryForRecordIfNecessary(required); + + final Object base = currentPage.getBaseObject(); + final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor); + UnsafeAlignedOffset.putSize(base, pageCursor, keyLen + valueLen + uaoSize); + pageCursor += uaoSize; + UnsafeAlignedOffset.putSize(base, pageCursor, keyLen); + pageCursor += uaoSize; + Platform.copyMemory(keyBase, keyOffset, base, pageCursor, keyLen); + pageCursor += keyLen; + Platform.copyMemory(valueBase, valueOffset, base, pageCursor, valueLen); + pageCursor += valueLen; + + assert (inMemSorter != null); + inMemSorter.insertRecord(recordAddress, prefix, prefixIsNull); + } + + /** Merges another UnsafeExternalSorters into this one, the other one will be emptied. */ + public void merge(UnsafeExternalSorter other) throws IOException { + other.spill(); + totalSpillBytes += other.totalSpillBytes; + spillWriters.addAll(other.spillWriters); + // remove them from `spillWriters`, or the files will be deleted in `cleanupResources`. + other.spillWriters.clear(); + other.cleanupResources(); + } + + /** + * Returns a sorted iterator. It is the caller's responsibility to call `cleanupResources()` after + * consuming this iterator. + */ + public UnsafeSorterIterator getSortedIterator() throws IOException { + assert (recordComparatorSupplier != null); + if (spillWriters.isEmpty()) { + assert (inMemSorter != null); + readingIterator = new SpillableIterator(inMemSorter.getSortedIterator()); + return readingIterator; + } else { + final UnsafeSorterSpillMerger spillMerger = + new UnsafeSorterSpillMerger( + recordComparatorSupplier.get(), prefixComparator, spillWriters.size()); + for (UnsafeSorterSpillWriter spillWriter : spillWriters) { + spillMerger.addSpillIfNotEmpty(spillWriter.getReader(serializerManager)); + } + if (inMemSorter != null) { + readingIterator = new SpillableIterator(inMemSorter.getSortedIterator()); + spillMerger.addSpillIfNotEmpty(readingIterator); + } + return spillMerger.getSortedIterator(); + } + } + + @VisibleForTesting + boolean hasSpaceForAnotherRecord() { + return inMemSorter.hasSpaceForAnotherRecord(); + } + + private static void spillIterator( + UnsafeSorterIterator inMemIterator, UnsafeSorterSpillWriter spillWriter) throws IOException { + while (inMemIterator.hasNext()) { + inMemIterator.loadNext(); + final Object baseObject = inMemIterator.getBaseObject(); + final long baseOffset = inMemIterator.getBaseOffset(); + final int recordLength = inMemIterator.getRecordLength(); + spillWriter.write(baseObject, baseOffset, recordLength, inMemIterator.getKeyPrefix()); + } + spillWriter.close(); + } + + /** An UnsafeSorterIterator that support spilling. 
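The key-value layout written by insertKVRecord above can be reproduced with plain byte arithmetic. A standalone sketch, assuming the 4-byte uaoSize case (the 8-byte case widens every length field accordingly):

import java.nio.ByteBuffer;

// Illustrative layout check: [record length][key length][key bytes][value bytes],
// with record length = key length + value length + 4.
public class KVRecordLayoutSketch {
  public static void main(String[] args) {
    byte[] key = {1, 2, 3};      // keyLen = 3
    byte[] value = {4, 5, 6, 7}; // valueLen = 4

    int recordLength = key.length + value.length + 4; // 11
    ByteBuffer page = ByteBuffer.allocate(4 + 4 + key.length + value.length);
    page.putInt(recordLength); // record length (4 bytes)
    page.putInt(key.length);   // key length (4 bytes)
    page.put(key);             // key data
    page.put(value);           // value data

    page.flip();
    int readRecordLength = page.getInt();                       // 11
    int readKeyLength = page.getInt();                          // 3
    int readValueLength = readRecordLength - readKeyLength - 4; // 4
    System.out.println(readRecordLength + " " + readKeyLength + " " + readValueLength);
  }
}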
*/ + class SpillableIterator extends UnsafeSorterIterator { + private UnsafeSorterIterator upstream; + private MemoryBlock lastPage = null; + private boolean loaded = false; + private int numRecords; + + private Object currentBaseObject; + private long currentBaseOffset; + private int currentRecordLength; + private long currentKeyPrefix; + + SpillableIterator(UnsafeSorterIterator inMemIterator) { + this.upstream = inMemIterator; + this.numRecords = inMemIterator.getNumRecords(); + } + + @Override + public int getNumRecords() { + return numRecords; + } + + @Override + public long getCurrentPageNumber() { + throw new UnsupportedOperationException(); + } + + public long spill() throws IOException { + UnsafeInMemorySorter inMemSorterToFree = null; + List pagesToFree = new LinkedList<>(); + try { + synchronized (this) { + if (inMemSorter == null) { + return 0L; + } + + long currentPageNumber = upstream.getCurrentPageNumber(); + + ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics(); + if (numRecords > 0) { + // Iterate over the records that have not been returned and spill them. + final UnsafeSorterSpillWriter spillWriter = + new UnsafeSorterSpillWriter( + blockManager, fileBufferSizeBytes, writeMetrics, numRecords); + spillIterator(upstream, spillWriter); + spillWriters.add(spillWriter); + upstream = spillWriter.getReader(serializerManager); + } else { + // Nothing to spill as all records have been read already, but do not return yet, as the + // memory still has to be freed. + upstream = null; + } + + long released = 0L; + synchronized (UnsafeExternalSorter.this) { + // release the pages except the one that is used. There can still be a caller that + // is accessing the current record. We free this page in that caller's next loadNext() + // call. + for (MemoryBlock page : allocatedPages) { + if (!loaded || page.pageNumber != currentPageNumber) { + released += page.size(); + // Do not free the page, while we are locking `SpillableIterator`. The `freePage` + // method locks the `TaskMemoryManager`, and it's not a good idea to lock 2 objects + // in sequence. We may hit dead lock if another thread locks `TaskMemoryManager` + // and `SpillableIterator` in sequence, which may happen in + // `TaskMemoryManager.acquireExecutionMemory`. + pagesToFree.add(page); + } else { + lastPage = page; + } + } + allocatedPages.clear(); + if (lastPage != null) { + // Add the last page back to the list of allocated pages to make sure it gets freed in + // case loadNext() never gets called again. + allocatedPages.add(lastPage); + } + } + + // in-memory sorter will not be used after spilling + assert (inMemSorter != null); + released += inMemSorter.getMemoryUsage(); + totalSortTimeNanos += inMemSorter.getSortTimeNanos(); + // Do not free the sorter while we are locking `SpillableIterator`, + // as this can cause a deadlock. 
+ inMemSorterToFree = inMemSorter; + inMemSorter = null; + taskContext.taskMetrics().incMemoryBytesSpilled(released); + taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten()); + totalSpillBytes += released; + return released; + } + } finally { + for (MemoryBlock pageToFree : pagesToFree) { + freePage(pageToFree); + } + if (inMemSorterToFree != null) { + inMemSorterToFree.freeMemory(); + } + } + } + + @Override + public boolean hasNext() { + return numRecords > 0; + } + + @Override + public void loadNext() throws IOException { + assert upstream != null; + MemoryBlock pageToFree = null; + try { + synchronized (this) { + loaded = true; + // Just consumed the last record from the in-memory iterator. + if (lastPage != null) { + // Do not free the page here, while we are locking `SpillableIterator`. The `freePage` + // method locks the `TaskMemoryManager`, and it's a bad idea to lock 2 objects in + // sequence. We may hit dead lock if another thread locks `TaskMemoryManager` and + // `SpillableIterator` in sequence, which may happen in + // `TaskMemoryManager.acquireExecutionMemory`. + pageToFree = lastPage; + allocatedPages.clear(); + lastPage = null; + } + numRecords--; + upstream.loadNext(); + + // Keep track of the current base object, base offset, record length, and key prefix, + // so that the current record can still be read in case a spill is triggered and we + // switch to the spill writer's iterator. + currentBaseObject = upstream.getBaseObject(); + currentBaseOffset = upstream.getBaseOffset(); + currentRecordLength = upstream.getRecordLength(); + currentKeyPrefix = upstream.getKeyPrefix(); + } + } finally { + if (pageToFree != null) { + freePage(pageToFree); + } + } + } + + @Override + public Object getBaseObject() { + return currentBaseObject; + } + + @Override + public long getBaseOffset() { + return currentBaseOffset; + } + + @Override + public int getRecordLength() { + return currentRecordLength; + } + + @Override + public long getKeyPrefix() { + return currentKeyPrefix; + } + } + + /** + * Returns an iterator starts from startIndex, which will return the rows in the order as + * inserted. + * + *

It is the caller's responsibility to call `cleanupResources()` after consuming this + * iterator. + * + *

TODO: support forced spilling + */ + public UnsafeSorterIterator getIterator(int startIndex) throws IOException { + if (spillWriters.isEmpty()) { + assert (inMemSorter != null); + UnsafeSorterIterator iter = inMemSorter.getSortedIterator(); + moveOver(iter, startIndex); + return iter; + } else { + LinkedList queue = new LinkedList<>(); + int i = 0; + for (UnsafeSorterSpillWriter spillWriter : spillWriters) { + if (i + spillWriter.recordsSpilled() > startIndex) { + UnsafeSorterIterator iter = spillWriter.getReader(serializerManager); + moveOver(iter, startIndex - i); + queue.add(iter); + } + i += spillWriter.recordsSpilled(); + } + if (inMemSorter != null && inMemSorter.numRecords() > 0) { + UnsafeSorterIterator iter = inMemSorter.getSortedIterator(); + moveOver(iter, startIndex - i); + queue.add(iter); + } + return new ChainedIterator(queue); + } + } + + private void moveOver(UnsafeSorterIterator iter, int steps) throws IOException { + if (steps > 0) { + for (int i = 0; i < steps; i++) { + if (iter.hasNext()) { + iter.loadNext(); + } else { + throw new ArrayIndexOutOfBoundsException( + "Failed to move the iterator " + steps + " steps forward"); + } + } + } + } + + /** Chain multiple UnsafeSorterIterator together as single one. */ + static class ChainedIterator extends UnsafeSorterIterator { + + private final Queue iterators; + private UnsafeSorterIterator current; + private int numRecords; + private final int[] iteratorsLength; + + ChainedIterator(Queue iterators) { + assert iterators.size() > 0; + this.numRecords = 0; + this.iteratorsLength = new int[iterators.size()]; + int i = 0; + for (UnsafeSorterIterator iter : iterators) { + this.numRecords += iter.getNumRecords(); + iteratorsLength[i++] = iter.getNumRecords(); + } + this.iterators = iterators; + this.current = iterators.remove(); + } + + int[] numRecordForEach() { + return iteratorsLength; + } + + @Override + public int getNumRecords() { + return numRecords; + } + + @Override + public long getCurrentPageNumber() { + return current.getCurrentPageNumber(); + } + + @Override + public boolean hasNext() { + while (!current.hasNext() && !iterators.isEmpty()) { + current = iterators.remove(); + } + return current.hasNext(); + } + + @Override + public void loadNext() throws IOException { + while (!current.hasNext() && !iterators.isEmpty()) { + current = iterators.remove(); + } + current.loadNext(); + } + + @Override + public Object getBaseObject() { + return current.getBaseObject(); + } + + @Override + public long getBaseOffset() { + return current.getBaseOffset(); + } + + @Override + public int getRecordLength() { + return current.getRecordLength(); + } + + @Override + public long getKeyPrefix() { + return current.getKeyPrefix(); + } + } +} diff --git a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllNode.java b/shims/spark32/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala similarity index 57% rename from gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllNode.java rename to shims/spark32/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala index e5878e0e33879..32d6943713936 100644 --- a/gluten-core/src/main/java/org/apache/gluten/substrait/ddlplan/DllNode.java +++ b/shims/spark32/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala @@ -14,26 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. 
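The startIndex bookkeeping in getIterator(int) above (skip spills that end before startIndex, fast-forward into the first spill that overlaps it, then chain the rest in insertion order) is easier to follow on plain Java iterators. A sketch in which integer lists stand in for spill readers:

import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

// Illustrative only: reproduce the i / startIndex arithmetic with in-memory lists.
public class ChainedSkipSketch {
  public static void main(String[] args) {
    List<List<Integer>> spills =
        Arrays.asList(Arrays.asList(0, 1, 2), Arrays.asList(3, 4), Arrays.asList(5, 6, 7));
    int startIndex = 4;

    Queue<Iterator<Integer>> queue = new LinkedList<>();
    int i = 0;
    for (List<Integer> spill : spills) {
      if (i + spill.size() > startIndex) {
        Iterator<Integer> iter = spill.iterator();
        for (int skip = Math.max(0, startIndex - i); skip > 0; skip--) {
          iter.next(); // fast-forward inside the first overlapping spill only
        }
        queue.add(iter);
      }
      i += spill.size();
    }

    StringBuilder out = new StringBuilder();
    for (Iterator<Integer> it : queue) {
      it.forEachRemaining(v -> out.append(v).append(' '));
    }
    System.out.println(out.toString().trim()); // 4 5 6 7
  }
}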
*/ -package org.apache.gluten.substrait.ddlplan; +package org.apache.gluten.utils -import io.substrait.proto.Dll; +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.types.StructType -import java.io.Serializable; -import java.util.List; - -public class DllNode implements Serializable { - - private final List dllPlans; - - public DllNode(List dllPlans) { - this.dllPlans = dllPlans; +object InternalRowUtl { + def toString(struct: StructType, rows: Iterator[InternalRow]): String = { + val encoder = RowEncoder(struct).resolveAndBind() + val deserializer = encoder.createDeserializer() + rows.map(deserializer).mkString(System.lineSeparator()) } - public Dll toProtobuf() { - Dll.Builder dllBuilder = Dll.newBuilder(); - for (DllPlanNode dllPlanNode : dllPlans) { - dllBuilder.addDllPlan(dllPlanNode.toProtobuf()); - } - return dllBuilder.build(); + def toString(struct: StructType, rows: Iterator[InternalRow], start: Int, length: Int): String = { + toString(struct, rows.slice(start, start + length)) } + } diff --git a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/shims/spark32/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 619fa64ace6d8..940c47f1a55be 100644 --- a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/shims/spark32/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -82,14 +82,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { - // Why if (false)? Such code requires comments when being written. - if ("true" == sparkSession.sparkContext.getLocalProperty("isNativeApplicable") && false) { - GlutenOrcWriterInjects - .getInstance() - .inferSchema(sparkSession, Map.empty[String, String], files) - } else { // the vanilla spark case - OrcUtils.inferSchema(sparkSession, files, options) - } + OrcUtils.inferSchema(sparkSession, files, options) } override def prepareWrite( diff --git a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/shims/spark32/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 42a63c7ebcd1d..fb03fb5f4a21a 100644 --- a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/shims/spark32/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -200,12 +200,7 @@ class ParquetFileFormat extends FileFormat with DataSourceRegister with Logging sparkSession: SparkSession, parameters: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { - // Why if (false)? Such code requires comments when being written. - if ("true" == sparkSession.sparkContext.getLocalProperty("isNativeApplicable") && false) { - GlutenParquetWriterInjects.getInstance().inferSchema(sparkSession, parameters, files) - } else { // the vanilla spark case - ParquetUtils.inferSchema(sparkSession, parameters, files) - } + ParquetUtils.inferSchema(sparkSession, parameters, files) } /** Returns whether the reader will return the rows as batch or not. 
*/ diff --git a/shims/spark33/pom.xml b/shims/spark33/pom.xml index f552eddf3fb00..e17a639baa359 100644 --- a/shims/spark33/pom.xml +++ b/shims/spark33/pom.xml @@ -43,13 +43,13 @@ org.apache.spark - spark-catalyst_2.12 + spark-catalyst_${scala.binary.version} provided true org.apache.spark - spark-core_2.12 + spark-core_${scala.binary.version} provided true diff --git a/shims/spark33/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala b/shims/spark33/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala new file mode 100644 index 0000000000000..32d6943713936 --- /dev/null +++ b/shims/spark33/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.utils + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.types.StructType + +object InternalRowUtl { + def toString(struct: StructType, rows: Iterator[InternalRow]): String = { + val encoder = RowEncoder(struct).resolveAndBind() + val deserializer = encoder.createDeserializer() + rows.map(deserializer).mkString(System.lineSeparator()) + } + + def toString(struct: StructType, rows: Iterator[InternalRow], start: Int, length: Int): String = { + toString(struct, rows.slice(start, start + length)) + } + +} diff --git a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala index 9891f6851d009..08463ab6111da 100644 --- a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala +++ b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala @@ -65,12 +65,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { - // Why if (false)? Such code requires comments when being written. 
- if ("true" == sparkSession.sparkContext.getLocalProperty("isNativeApplicable") && false) { - GlutenOrcWriterInjects.getInstance().inferSchema(sparkSession, options, files) - } else { // the vanilla spark case - OrcUtils.inferSchema(sparkSession, files, options) - } + OrcUtils.inferSchema(sparkSession, files, options) } override def prepareWrite( diff --git a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala index 403e31c1cb302..8954f58233916 100644 --- a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala +++ b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala @@ -196,12 +196,7 @@ class ParquetFileFormat extends FileFormat with DataSourceRegister with Logging sparkSession: SparkSession, parameters: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { - // Why if (false)? Such code requires comments when being written. - if ("true" == sparkSession.sparkContext.getLocalProperty("isNativeApplicable") && false) { - GlutenParquetWriterInjects.getInstance().inferSchema(sparkSession, parameters, files) - } else { // the vanilla spark case - ParquetUtils.inferSchema(sparkSession, parameters, files) - } + ParquetUtils.inferSchema(sparkSession, parameters, files) } /** Returns whether the reader will return the rows as batch or not. */ diff --git a/shims/spark34/pom.xml b/shims/spark34/pom.xml index 5d6a5dd1960f8..bd1de9fe8561d 100644 --- a/shims/spark34/pom.xml +++ b/shims/spark34/pom.xml @@ -43,13 +43,13 @@ org.apache.spark - spark-catalyst_2.12 + spark-catalyst_${scala.binary.version} provided true org.apache.spark - spark-core_2.12 + spark-core_${scala.binary.version} provided true diff --git a/shims/spark34/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala b/shims/spark34/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala new file mode 100644 index 0000000000000..32d6943713936 --- /dev/null +++ b/shims/spark34/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.utils + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.types.StructType + +object InternalRowUtl { + def toString(struct: StructType, rows: Iterator[InternalRow]): String = { + val encoder = RowEncoder(struct).resolveAndBind() + val deserializer = encoder.createDeserializer() + rows.map(deserializer).mkString(System.lineSeparator()) + } + + def toString(struct: StructType, rows: Iterator[InternalRow], start: Int, length: Int): String = { + toString(struct, rows.slice(start, start + length)) + } + +} diff --git a/shims/spark35/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala b/shims/spark35/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala new file mode 100644 index 0000000000000..654e43cbd03f5 --- /dev/null +++ b/shims/spark35/src/main/scala/org/apache/gluten/utils/InternalRowUtl.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.utils + +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.types.StructType + +object InternalRowUtl { + def toString(struct: StructType, rows: Iterator[InternalRow]): String = { + val encoder = ExpressionEncoder(struct).resolveAndBind() + val deserializer = encoder.createDeserializer() + rows.map(deserializer).mkString(System.lineSeparator()) + } + + def toString(struct: StructType, rows: Iterator[InternalRow], start: Int, length: Int): String = { + toString(struct, rows.slice(start, start + length)) + } +} diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/SparkJvmOptions.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/SparkJvmOptions.java new file mode 100644 index 0000000000000..85c0912fd7c7a --- /dev/null +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/SparkJvmOptions.java @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.gluten.integration;
+
+import java.lang.reflect.InvocationTargetException;
+import java.lang.reflect.Method;
+
+public class SparkJvmOptions {
+  private static final String MODULE_OPTIONS_CLASS_NAME = "org.apache.spark.launcher.JavaModuleOptions";
+
+  public static String read() {
+    try {
+      final Class<?> clazz = Class.forName("org.apache.spark.launcher.JavaModuleOptions");
+      final Method method = clazz.getMethod("defaultModuleOptions");
+      return (String) method.invoke(null);
+    } catch (ClassNotFoundException e) {
+      // Could happen in Spark 3.2 which doesn't have this class yet.
+      return "";
+    } catch (NoSuchMethodException | InvocationTargetException | IllegalAccessException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  public static void main(String[] args) {
+    System.out.println(read());
+  }
+}
diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java
index 6750b90e9e495..d186b5d0b1d63 100644
--- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java
+++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/command/SparkRunModes.java
@@ -332,6 +332,7 @@ public String getSparkMasterUrl() {
     @Override
     public Map<String, String> extraSparkConf() {
       final Map<String, String> extras = new HashMap<>();
+      extras.put(SparkLauncher.EXECUTOR_DEFAULT_JAVA_OPTIONS, "-Dio.netty.tryReflectionSetAccessible=true");
       extras.put(SparkLauncher.EXECUTOR_CORES, String.valueOf(resourceEnumeration.lcExecutorCores()));
       extras.put(SparkLauncher.EXECUTOR_MEMORY, String.format("%dm", resourceEnumeration.lcExecutorHeapMem()));
       extras.put("spark.memory.offHeap.enabled", "true");
diff --git a/tools/gluten-it/sbin/gluten-it.sh b/tools/gluten-it/sbin/gluten-it.sh
index 00ff78e349977..8c1a6413b5ec2 100755
--- a/tools/gluten-it/sbin/gluten-it.sh
+++ b/tools/gluten-it/sbin/gluten-it.sh
@@ -16,8 +16,6 @@
 
 set -euf
 
-GLUTEN_IT_JVM_ARGS=${GLUTEN_IT_JVM_ARGS:-"-Xmx2G -XX:ErrorFile=/var/log/java/hs_err_pid%p.log"}
-
 BASEDIR=$(dirname $0)
 
 LIB_DIR=$BASEDIR/../package/target/lib
@@ -28,32 +26,25 @@ fi
 
 JAR_PATH=$LIB_DIR/*
 
+SPARK_JVM_OPTIONS=$($JAVA_HOME/bin/java -cp $JAR_PATH org.apache.gluten.integration.SparkJvmOptions)
+
 EMBEDDED_SPARK_HOME=$BASEDIR/../spark-home
 
+# We temporarily disallow setting these two variables by caller.
+SPARK_HOME=""
+SPARK_SCALA_VERSION=""
 export SPARK_HOME=${SPARK_HOME:-$EMBEDDED_SPARK_HOME}
 export SPARK_SCALA_VERSION=${SPARK_SCALA_VERSION:-'2.12'}
 
 echo "SPARK_HOME set at [$SPARK_HOME]."
 echo "SPARK_SCALA_VERSION set at [$SPARK_SCALA_VERSION]."
 
-$JAVA_HOME/bin/java $GLUTEN_IT_JVM_ARGS \
-  -XX:+IgnoreUnrecognizedVMOptions \
-  --add-opens=java.base/java.lang=ALL-UNNAMED \
-  --add-opens=java.base/java.lang.invoke=ALL-UNNAMED \
-  --add-opens=java.base/java.lang.reflect=ALL-UNNAMED \
-  --add-opens=java.base/java.io=ALL-UNNAMED \
-  --add-opens=java.base/java.net=ALL-UNNAMED \
-  --add-opens=java.base/java.nio=ALL-UNNAMED \
-  --add-opens=java.base/java.util=ALL-UNNAMED \
-  --add-opens=java.base/java.util.concurrent=ALL-UNNAMED \
-  --add-opens=java.base/java.util.concurrent.atomic=ALL-UNNAMED \
-  --add-opens=java.base/jdk.internal.ref=ALL-UNNAMED \
-  --add-opens=java.base/jdk.internal.misc=ALL-UNNAMED \
-  --add-opens=java.base/sun.nio.ch=ALL-UNNAMED \
-  --add-opens=java.base/sun.nio.cs=ALL-UNNAMED \
-  --add-opens=java.base/sun.security.action=ALL-UNNAMED \
-  --add-opens=java.base/sun.util.calendar=ALL-UNNAMED \
-  -Djdk.reflect.useDirectMethodHandle=false \
+GLUTEN_IT_JVM_ARGS=${GLUTEN_IT_JVM_ARGS:-"-Xmx2G"}
+
+$JAVA_HOME/bin/java \
+  $SPARK_JVM_OPTIONS \
+  $GLUTEN_IT_JVM_ARGS \
+  -XX:ErrorFile=/var/log/java/hs_err_pid%p.log \
   -Dio.netty.tryReflectionSetAccessible=true \
   -cp $JAR_PATH \
   org.apache.gluten.integration.Cli $@
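
For illustration, here is a minimal, hypothetical usage sketch of the new InternalRowUtl helper; it is not part of the patch above. It assumes a Spark 3.5 classpath, where ExpressionEncoder(StructType) is available as used in the spark35 shim, and the object name InternalRowUtlExample plus the sample schema and rows are made up for this example only.

// Hypothetical example (not part of the patch): pretty-print InternalRows via the
// new InternalRowUtl helper, assuming a Spark 3.5 classpath (spark35 shim).
import org.apache.gluten.utils.InternalRowUtl
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object InternalRowUtlExample {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("id", IntegerType, nullable = false),
      StructField("name", StringType)))

    // Build a few InternalRows from external Rows; copy() because the serializer
    // may reuse its output buffer between calls.
    val toInternal = ExpressionEncoder(schema).resolveAndBind().createSerializer()
    val internalRows = Seq(Row(1, "a"), Row(2, "b")).map(r => toInternal(r).copy()).iterator

    // Expected to print one external Row per line, e.g. "[1,a]" then "[2,b]".
    println(InternalRowUtl.toString(schema, internalRows))
  }
}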