Skip to content

[VL] Add support to write parquet files to GCS #7701

[VL] Add support to write parquet files to GCS

[VL] Add support to write parquet files to GCS #7701

Workflow file for this run

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: Velox backend
on:
pull_request:
paths:
- '.github/**'
- 'pom.xml'
- 'backends-velox/**'
- 'gluten-celeborn/**'
- 'gluten-core/**'
- 'gluten-data/**'
- 'gluten-delta/**'
- 'gluten-ut/**'
- 'shims/**'
- 'tools/gluten-it/**'
- 'tools/gluten-te/**'
- 'ep/build-velox/**'
- 'cpp/*'
- 'cpp/CMake/**'
- 'cpp/velox/**'
- 'cpp/core/**'
- 'dev/**'
# - 'substrait/substrait-spark/**'
env:
HTTP_PROXY_HOST: 10.240.252.16
HTTP_PROXY_PORT: 911
DOCKER_REGISTRY_HOST: 10.0.2.4
DOCKER_REGISTRY_PORT: 5000
concurrency:
group: ${{ github.repository }}-${{ github.head_ref || github.sha }}-${{ github.workflow }}
cancel-in-progress: true
jobs:
ubuntu2004-test-spark32:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name ubuntu2004-test-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:20.04 \
'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON --build_test_utils=ON'
- name: Build Gluten CPP library
run: |
docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_tests=ON --build_examples=ON --build_benchmarks=ON'
- name: Run CPP unit test
run: |
docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/cpp/build && \
ctest -V'
- name: Run HBM CPP unit test
run: |
docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/cpp/build && \
cmake -DBUILD_TESTS=ON -DENABLE_HBM=ON .. && \
cmake --build . --target hbw_allocator_test -- -j && \
ctest -V -R TestHbw'
- name: Build and run unit test for Spark 3.2.2 (other tests)
run: |
docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten && \
mvn clean install -Pspark-3.2 -Pspark-ut -Pbackends-velox -Prss -DargLine="-Dspark.test.home=/opt/spark322" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,io.glutenproject.tags.UDFTest,io.glutenproject.tags.SkipTestTags && \
mvn test -Pspark-3.2 -Pbackends-velox -DtagsToExclude=None -DtagsToInclude=io.glutenproject.tags.UDFTest'
# Cpp micro benchmarks will use generated files from unit test in backends-velox module.
- name: Run micro benchmarks
run: |
docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/cpp/build/velox/benchmarks && \
./generic_benchmark --with-shuffle --threads 1 --iterations 1'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop ubuntu2004-test-$GITHUB_RUN_ID || true
ubuntu2004-test-spark32-slow:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name ubuntu2004-test-slow-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:20.04 \
'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec ubuntu2004-test-slow-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON'
- name: Build Gluten CPP library
run: |
docker exec ubuntu2004-test-slow-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox'
- name: Build and run unit test for Spark 3.2.2(slow tests)
run: |
docker exec ubuntu2004-test-slow-$GITHUB_RUN_ID bash -c '
cd /opt/gluten && \
mvn clean install -Pspark-3.2 -Pspark-ut -Pbackends-velox -Prss -Piceberg -Pdelta -DargLine="-Dspark.test.home=/opt/spark322" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest'
- name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2
run: |
docker exec ubuntu2004-test-slow-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop ubuntu2004-test-slow-$GITHUB_RUN_ID || true
ubuntu2004-test-spark33-slow:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:20.04 \
'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID bash -l -c '
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON'
- name: Build Gluten CPP library
run: |
docker exec ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID bash -l -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox'
- name: Build and Run unit test for Spark 3.3.1(slow tests)
run: |
docker exec ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID bash -l -c 'cd /opt/gluten && \
mvn clean install -Pspark-3.3 -Pbackends-velox -Prss -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=/opt/spark331" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest'
- name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.3
run: |
docker exec ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID bash -l -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.3 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID || true
ubuntu2004-test-spark33:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name ubuntu2004-test-spark33-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:20.04 \
'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec ubuntu2004-test-spark33-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON'
- name: Build Gluten CPP library
run: |
docker exec ubuntu2004-test-spark33-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_examples=ON'
- name: Build and Run unit test for Spark 3.3.1(other tests)
run: |
docker exec ubuntu2004-test-spark33-$GITHUB_RUN_ID bash -c 'cd /opt/gluten && \
mvn clean install -Pspark-3.3 -Pbackends-velox -Prss -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=/opt/spark331" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,io.glutenproject.tags.UDFTest,io.glutenproject.tags.SkipTestTags && \
mvn test -Pspark-3.3 -Pbackends-velox -DtagsToExclude=None -DtagsToInclude=io.glutenproject.tags.UDFTest'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop ubuntu2004-test-spark33-$GITHUB_RUN_ID || true
ubuntu2004-test-spark34-slow:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:20.04 \
'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID bash -l -c '
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON'
- name: Build Gluten CPP library
run: |
docker exec ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID bash -l -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox '
- name: Build and Run unit test for Spark 3.4.1(slow tests)
run: |
docker exec ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID bash -l -c 'cd /opt/gluten && \
mvn clean install -Pspark-3.4 -Pbackends-velox -Prss -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=/opt/spark331" -DtagsToInclude=org.apache.spark.tags.ExtendedSQLTest'
- name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.4
run: |
docker exec ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID bash -l -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.4 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID || true
ubuntu2004-test-spark34:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name ubuntu2004-test-spark34-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:20.04 \
'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec ubuntu2004-test-spark34-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON'
- name: Build Gluten CPP library
run: |
docker exec ubuntu2004-test-spark34-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_examples=ON'
- name: Build and Run unit test for Spark 3.4.1(other tests)
run: |
docker exec ubuntu2004-test-spark34-$GITHUB_RUN_ID bash -c 'cd /opt/gluten && \
mvn clean install -Pspark-3.4 -Pbackends-velox -Prss -Piceberg -Pdelta -Pspark-ut -DargLine="-Dspark.test.home=/opt/spark331" -DtagsToExclude=org.apache.spark.tags.ExtendedSQLTest,io.glutenproject.tags.UDFTest,io.glutenproject.tags.SkipTestTags && \
mvn test -Pspark-3.4 -Pbackends-velox -DtagsToExclude=None -DtagsToInclude=io.glutenproject.tags.UDFTest'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop ubuntu2004-test-spark34-$GITHUB_RUN_ID || true
ubuntu2204-test-spark33-spark34:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name ubuntu2204-test-spark33-spark34-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:22.04 \
'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec ubuntu2204-test-spark33-spark34-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF'
- name: Build Gluten CPP library
run: |
docker exec ubuntu2204-test-spark33-spark34-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF'
- name: Build for Spark 3.3.1
run: |
docker exec ubuntu2204-test-spark33-spark34-$GITHUB_RUN_ID bash -c '
cd /opt/gluten && \
mvn clean install -Pspark-3.3 -Pbackends-velox -Prss -Piceberg -Pdelta -DskipTests'
- name: TPC-H SF1.0 && TPC-DS SF10.0 Parquet local spark3.3
run: |
docker exec ubuntu2204-test-spark33-spark34-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.3 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx20G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=30g -s=10.0 --threads=32 --iterations=1'
- name: Build for Spark 3.4.1
run: |
docker exec ubuntu2204-test-spark33-spark34-$GITHUB_RUN_ID bash -c '
cd /opt/gluten && \
mvn clean install -Pspark-3.4 -Pbackends-velox -Prss -Piceberg -Pdelta -DskipTests'
- name: TPC-H SF1.0 && TPC-DS SF10.0 Parquet local spark3.4
run: |
docker exec ubuntu2204-test-spark33-spark34-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.4 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx20G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=30g -s=10.0 --threads=32 --iterations=1'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop ubuntu2204-test-spark33-spark34-$GITHUB_RUN_ID || true
ubuntu2204-test:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name ubuntu2204-test-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:22.04 \
'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF'
- name: Build Gluten CPP library
run: |
docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF'
- name: Build for Spark 3.2.2
run: |
docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten && \
mvn clean install -Pspark-3.2 -Pbackends-velox -Prss -Piceberg -Pdelta -DskipTests'
- name: TPC-H SF1.0 && TPC-DS SF10.0 Parquet local spark3.2
run: |
docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx20G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=40g -s=10.0 --threads=32 --iterations=1'
- name: TPC-H SF1.0 && TPC-DS SF10.0 Parquet local spark3.2 with Celeborn
run: |
docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c \
'mv /opt/apache-celeborn-0.3.0-incubating-bin/conf/celeborn-env.sh.template /opt/apache-celeborn-0.3.0-incubating-bin/conf/celeborn-env.sh && \
echo -e "CELEBORN_MASTER_MEMORY=4g\nCELEBORN_WORKER_MEMORY=4g\nCELEBORN_WORKER_OFFHEAP_MEMORY=8g" > /opt/apache-celeborn-0.3.0-incubating-bin/conf/celeborn-env.sh && \
echo -e "celeborn.worker.commitFiles.threads 128\nceleborn.worker.sortPartition.threads 64" > /opt/apache-celeborn-0.3.0-incubating-bin/conf/celeborn-defaults.conf \
&& bash /opt/apache-celeborn-0.3.0-incubating-bin/sbin/start-master.sh && bash /opt/apache-celeborn-0.3.0-incubating-bin/sbin/start-worker.sh && \
cd /opt/gluten/tools/gluten-it && mvn clean install -Pspark-3.2,rss \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox-with-celeborn --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox-with-celeborn --benchmark-type=ds --error-on-memleak --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 && \
bash /opt/apache-celeborn-0.3.0-incubating-bin/sbin/stop-worker.sh \
&& bash /opt/apache-celeborn-0.3.0-incubating-bin/sbin/stop-master.sh'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop ubuntu2204-test-$GITHUB_RUN_ID || true
centos8-test:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name centos8-test-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/centos:8 \
bash -c 'cd /opt/gluten && sleep 14400'
- name: Build Gluten velox third party
run: |
docker exec centos8-test-$GITHUB_RUN_ID bash -c '
source /env.sh && \
sudo yum -y install patch && \
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF'
- name: Build Gluten CPP library
run: |
docker exec centos8-test-$GITHUB_RUN_ID bash -c '
source /env.sh && \
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF'
- name: Build for Spark 3.2.2
run: |
docker exec centos8-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten && \
mvn clean install -Pspark-3.2 -Pbackends-velox -Prss -Piceberg -Pdelta -DskipTests'
- name: TPC-H SF1.0 && TPC-DS SF30.0 Parquet local spark3.2
run: |
docker exec centos8-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=50g -s=30.0 --threads=32 --iterations=1'
- name: TPC-H SF1.0 && TPC-DS SF30.0 Parquet local spark3.2 random kill tasks
run: |
docker exec centos8-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 --skip-data-gen --random-kill-tasks \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh queries \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=50g -s=30.0 --threads=32 --iterations=1 --skip-data-gen --random-kill-tasks'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop centos8-test-$GITHUB_RUN_ID || true
centos7-test:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
EXTRA_DOCKER_OPTIONS="--name centos7-test-$GITHUB_RUN_ID -e NUM_THREADS=30 --detach" \
NON_INTERACTIVE=ON \
MOUNT_MAVEN_CACHE=OFF \
OS_IMAGE=centos:7 \
OS_VERSION=7 \
tools/gluten-te/centos/cbash-mount.sh sleep 14400
- name: Build Gluten velox third party
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c '
yum -y install epel-release centos-release-scl patch sudo && \
cd /opt/gluten/ep/build-velox/src && \
./get_velox.sh --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF && \
./build_velox.sh --velox_home=/opt/velox --enable_ep_cache=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF --enable_hdfs=ON'
- name: Build Gluten CPP library
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
source /opt/rh/devtoolset-9/enable && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON --enable_gcs=ON --enable_abfs=OFF'
- name: Build for Spark 3.2.2
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten && \
mvn clean install -Pspark-3.2 -Pbackends-velox -Prss -Piceberg -Pdelta -DskipTests'
- name: TPC-H SF1.0 && TPC-DS SF30.0 Parquet local spark3.2
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=50g -s=30.0 --threads=32 --iterations=1'
- name: TPC-DS SF30.0 Parquet local spark3.2 Q67/Q95 low memory, memory isolation off
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=OFFHEAP_SIZE:5g,spark.memory.offHeap.size=5g \
-d=OFFHEAP_SIZE:3g,spark.memory.offHeap.size=3g \
-d=OVER_ACQUIRE:0.3,spark.gluten.memory.overAcquiredMemoryRatio=0.3 \
-d=OVER_ACQUIRE:0.5,spark.gluten.memory.overAcquiredMemoryRatio=0.5'
- name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q67/Q95 low memory, memory isolation on
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2 \
&& GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q67,q95 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:5g,spark.memory.offHeap.size=5g \
-d=OFFHEAP_SIZE:3g,spark.memory.offHeap.size=3g \
-d=OVER_ACQUIRE:0.3,spark.gluten.memory.overAcquiredMemoryRatio=0.3 \
-d=OVER_ACQUIRE:0.5,spark.gluten.memory.overAcquiredMemoryRatio=0.5' || true
- name: TPC-DS SF30.0 Parquet local spark3.2 Q23A/Q23B low memory
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q23a,q23b -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
-d=PARTIAL_MODE:ABANDONED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
-d=PARTIAL_MODE:CACHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
-d=PARTIAL_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0'
- name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory # The case currently causes crash with "free: invalid size".
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/tools/gluten-it && \
GLUTEN_IT_JVM_ARGS=-Xmx50G sbin/gluten-it.sh parameterized \
--local --preset=velox --benchmark-type=ds --error-on-memleak --queries=q97 -s=30.0 --threads=12 --shuffle-partitions=72 --iterations=1 \
--skip-data-gen -m=OffHeapExecutionMemory \
-d=ISOLATION:OFF,spark.gluten.memory.isolation=false \
-d=ISOLATION:ON,spark.gluten.memory.isolation=true,spark.memory.storageFraction=0.1 \
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
-d=OFFHEAP_SIZE:1g,spark.memory.offHeap.size=1g' || true
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop centos7-test-$GITHUB_RUN_ID || true
static-build-test:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name static-build-test-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/centos:7 \
bash -c 'cd /opt/gluten && sleep 14400'
- name: Build Gluten CPP library
run: |
docker exec -i static-build-test-$GITHUB_RUN_ID bash -c '
source /env.sh && \
sudo yum -y install patch && \
cd /opt/gluten && \
sudo -E ./dev/vcpkg/setup-build-depends.sh && \
source ./dev/vcpkg/env.sh && \
./dev/builddeps-veloxbe.sh --enable_vcpkg=ON --build_tests=OFF --build_benchmarks=OFF --enable_s3=ON --enable_gcs=ON --enable_hdfs=ON --enable_abfs=OFF'
- name: Build for Spark 3.2.2
run: |
docker exec static-build-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten && \
mvn clean install -Pspark-3.2 -Pbackends-velox -Prss -Piceberg -Pdelta -DskipTests && \
cd /opt/gluten/tools/gluten-it && \
mvn clean install -Pspark-3.2'
- name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2 (centos 8)
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name static-build-test-$GITHUB_RUN_ID-tpc -e NUM_THREADS=30 $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/centos:8 \
bash -c 'cd /opt/gluten/tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx10G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=20g -s=1.0 --threads=32 --iterations=1'
- name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2 (ubuntu 20.04)
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name static-build-test-$GITHUB_RUN_ID-tpc -e NUM_THREADS=30 $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:20.04 \
'cd /opt/gluten/tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx10G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=20g -s=1.0 --threads=32 --iterations=1'
- name: TPC-H SF1.0 && TPC-DS SF1.0 Parquet local spark3.2 (ubuntu 22.04)
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name static-build-test-$GITHUB_RUN_ID-tpc -e NUM_THREADS=30 $DOCKER_REGISTRY_HOST:$DOCKER_REGISTRY_PORT/gluten-dev/ubuntu:22.04 \
'cd /opt/gluten/tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx5G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=h --error-on-memleak --disable-aqe --off-heap-size=10g -s=1.0 --threads=16 --iterations=1 \
&& GLUTEN_IT_JVM_ARGS=-Xmx10G sbin/gluten-it.sh queries-compare \
--local --preset=velox --benchmark-type=ds --error-on-memleak --off-heap-size=20g -s=1.0 --threads=32 --iterations=1'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop static-build-test-$GITHUB_RUN_ID || true
build-script-test:
runs-on: velox-self-hosted
steps:
- uses: actions/checkout@v4
- name: Setup docker container
run: |
docker run --rm --init --privileged --ulimit nofile=65536:65536 --ulimit core=-1 --security-opt seccomp=unconfined \
-v $PWD:/opt/gluten --name build-script-test-$GITHUB_RUN_ID -e NUM_THREADS=30 -detach 10.0.2.4:5000/gluten-dev/centos:8 \
bash -c 'cd /opt/gluten && sleep 14400'
- name: Build Script Test
run: |
docker exec build-script-test-$GITHUB_RUN_ID bash -c '
source /env.sh && \
sudo yum -y install patch && \
cd /opt/gluten/ && \
./dev/package.sh'
- name: Exit docker container
if: ${{ always() }}
run: |
docker stop build-script-test-$GITHUB_RUN_ID || true