Skip to content

Commit

Permalink
Merge branch 'main' of https://github.com/gaoyangxiaozhu/gluten into …
Browse files Browse the repository at this point in the history
…gayangya/split_refactor
  • Loading branch information
gaoyangxiaozhu committed Jul 2, 2024
2 parents 33cf50e + eb1b913 commit ab38690
Show file tree
Hide file tree
Showing 11 changed files with 33 additions and 65 deletions.
15 changes: 7 additions & 8 deletions .github/workflows/velox_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,7 @@ jobs:
-d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
-d=FLUSH_MODE:ABANDONED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
-d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0
- name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q23A/Q23B low memory, memory isolation on
if: false # Disabled as error https://gist.github.com/zhztheplayer/abd5e83ccdc48730678ae7ebae479fcc
- name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q23A/Q23B low memory, memory isolation on # Disabled as error https://gist.github.com/zhztheplayer/abd5e83ccdc48730678ae7ebae479fcc
run: |
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
Expand All @@ -346,8 +345,8 @@ jobs:
-d=OFFHEAP_SIZE:2g,spark.memory.offHeap.size=2g \
-d=FLUSH_MODE:DISABLED,spark.gluten.sql.columnar.backend.velox.flushablePartialAggregation=false,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
-d=FLUSH_MODE:ABANDONED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=1.0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=0,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 \
-d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0
- name: (To be fixed) TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory # The case currently causes crash with "free: invalid size".
-d=FLUSH_MODE:FLUSHED,spark.gluten.sql.columnar.backend.velox.maxPartialAggregationMemoryRatio=0.05,spark.gluten.sql.columnar.backend.velox.maxExtendedPartialAggregationMemoryRatio=0.1,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinPct=100,spark.gluten.sql.columnar.backend.velox.abandonPartialAggregationMinRows=0 || true
- name: TPC-DS SF30.0 Parquet local spark3.2 Q97 low memory
run: |
cd tools/gluten-it \
&& GLUTEN_IT_JVM_ARGS=-Xmx3G sbin/gluten-it.sh parameterized \
Expand Down Expand Up @@ -617,6 +616,10 @@ jobs:
install_arrow_deps
./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_tests=ON \
--build_examples=ON --build_benchmarks=ON --build_protobuf=ON
- name: Gluten CPP Test
run: |
cd ./cpp/build && \
ctest -V
- uses: actions/upload-artifact@v2
with:
name: velox-native-lib-centos-8-${{github.sha}}
Expand Down Expand Up @@ -682,10 +685,6 @@ jobs:
working-directory: ${{ github.workspace }}
run: |
mkdir -p '${{ env.CCACHE_DIR }}'
- name: Gluten CPP Test
run: |
cd $GITHUB_WORKSPACE/cpp/build && \
ctest -V
- name: Prepare spark.test.home for Spark 3.2.2 (other tests)
run: |
cd $GITHUB_WORKSPACE/ && \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -612,13 +612,6 @@ class CHSparkPlanExecApi extends SparkPlanExecApi {
CHStringTranslateTransformer(substraitExprName, srcExpr, matchingExpr, replaceExpr, original)
}

/**
 * Builds the transformer for a Spark `Size` expression on this backend.
 *
 * Delegates to `CHSizeExpressionTransformer`, forwarding all three arguments unchanged.
 *
 * @param substraitExprName function name forwarded to the transformer
 * @param child transformer for the `Size` expression's child
 * @param original the Spark `Size` expression being converted
 * @return the `CHSizeExpressionTransformer` wrapping `child`
 */
override def genSizeExpressionTransformer(
substraitExprName: String,
child: ExpressionTransformer,
original: Size): ExpressionTransformer = {
CHSizeExpressionTransformer(substraitExprName, child, original)
}

override def genLikeTransformer(
substraitExprName: String,
left: ExpressionTransformer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,16 +30,6 @@ import com.google.common.collect.Lists

import java.util.Locale

/**
 * Transformer that presents a Spark `Size` expression as a binary expression.
 *
 * The left operand is the transformed child; the right operand is a literal built from
 * `original.legacySizeOfNull`.
 *
 * @param substraitExprName function name carried by this transformer
 * @param expr transformer for the `Size` expression's child
 * @param original the Spark `Size` expression being converted
 */
case class CHSizeExpressionTransformer(
substraitExprName: String,
expr: ExpressionTransformer,
original: Size)
extends BinaryExpressionTransformer {
// First argument: the child expression whose size is computed.
override def left: ExpressionTransformer = expr
// Second argument of the Substrait function: the legacySizeOfNull flag, passed as a literal.
override def right: ExpressionTransformer = LiteralTransformer(original.legacySizeOfNull)
}

case class CHTruncTimestampTransformer(
substraitExprName: String,
format: ExpressionTransformer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,12 @@ class ScalarFunctionsValidateSuite extends FunctionsValidateTest {
}
}

// Runs `SELECT array_size(null)` through runQueryAndCompare and checks that the plan
// contains a ProjectExecTransformer.
// NOTE(review): Some("3.3") presumably restricts this test to Spark 3.3 and newer — confirm
// against the definition of testWithSpecifiedSparkVersion.
testWithSpecifiedSparkVersion("null input for array_size", Some("3.3")) {
runQueryAndCompare("SELECT array_size(null)") {
checkGlutenOperatorMatch[ProjectExecTransformer]
}
}

test("chr function") {
val df = runQueryAndCompare(
"SELECT chr(l_orderkey + 64) " +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1017,7 +1017,7 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla
}
}

ignore("test explode/posexplode function") {
test("test explode/posexplode function") {
Seq("explode", "posexplode").foreach {
func =>
// Literal: func(literal)
Expand Down Expand Up @@ -1190,7 +1190,7 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla
|""".stripMargin)(_)
}

ignore("test multi-generate") {
test("test multi-generate") {
withTable("t") {
sql("CREATE TABLE t (col1 array<struct<a int, b string>>, col2 array<int>) using parquet")
sql("INSERT INTO t VALUES (array(struct(1, 'a'), struct(2, 'b')), array(1, 2))")
Expand Down Expand Up @@ -1588,7 +1588,7 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla
}
}

ignore("test array literal") {
test("test array literal") {
withTable("array_table") {
sql("create table array_table(a array<bigint>) using parquet")
sql("insert into table array_table select array(1)")
Expand All @@ -1601,7 +1601,7 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla
}
}

ignore("test map literal") {
test("test map literal") {
withTable("map_table") {
sql("create table map_table(a map<bigint, string>) using parquet")
sql("insert into table map_table select map(1, 'hello')")
Expand Down
2 changes: 2 additions & 0 deletions cpp/velox/tests/BufferOutputStreamTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
*/

#include "memory/BufferOutputStream.h"
#include "compute/VeloxBackend.h"
#include "memory/VeloxColumnarBatch.h"
#include "velox/common/memory/ByteStream.h"
#include "velox/vector/tests/utils/VectorTestBase.h"
Expand All @@ -27,6 +28,7 @@ class BufferOutputStreamTest : public ::testing::Test, public test::VectorTestBa
protected:
// Velox requires the memory manager to be instantiated before the tests run.
// This one-time fixture setup first creates the VeloxBackend (called with `{}`),
// then installs a testing MemoryManager instance (also called with `{}`).
static void SetUpTestCase() {
VeloxBackend::create({});
memory::MemoryManager::testingSetInstance({});
}

Expand Down
4 changes: 2 additions & 2 deletions dev/builddeps-veloxbe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ BUILD_TESTS=OFF
BUILD_EXAMPLES=OFF
BUILD_BENCHMARKS=OFF
BUILD_JEMALLOC=OFF
BUILD_PROTOBUF=ON
BUILD_PROTOBUF=OFF
BUILD_VELOX_TESTS=OFF
BUILD_VELOX_BENCHMARKS=OFF
ENABLE_QAT=OFF
Expand Down Expand Up @@ -201,7 +201,7 @@ function build_arrow {
function build_velox {
echo "Start to build Velox"
cd $GLUTEN_DIR/ep/build-velox/src
./get_velox.sh --enable_hdfs=$ENABLE_HDFS --build_protobuf=$BUILD_PROTOBUF --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --enable_abfs=$ENABLE_ABFS $VELOX_PARAMETER
./get_velox.sh --enable_hdfs=$ENABLE_HDFS --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --enable_abfs=$ENABLE_ABFS $VELOX_PARAMETER
# When BUILD_TESTS is on for gluten cpp, we need to turn on VELOX_BUILD_TEST_UTILS via build_test_utils.
./build_velox.sh --run_setup_script=$RUN_SETUP_SCRIPT --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --build_type=$BUILD_TYPE --enable_hdfs=$ENABLE_HDFS \
--enable_abfs=$ENABLE_ABFS --enable_ep_cache=$ENABLE_EP_CACHE --build_test_utils=$BUILD_TESTS --build_tests=$BUILD_VELOX_TESTS --build_benchmarks=$BUILD_VELOX_BENCHMARKS \
Expand Down
4 changes: 2 additions & 2 deletions docs/get-started/build-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ Please set them via `--`, e.g. `--build_type=Release`.
| build_tests | Build gluten cpp tests. | OFF |
| build_examples | Build udf example. | OFF |
| build_benchmarks | Build gluten cpp benchmarks. | OFF |
| build_jemalloc | Build with jemalloc. | OFF |
| build_protobuf | Build protobuf lib. | ON |
| build_jemalloc | Build with jemalloc. | OFF |
| build_protobuf | Build protobuf lib. | OFF |
| enable_qat | Enable QAT for shuffle data de/compression. | OFF |
| enable_iaa | Enable IAA for shuffle data de/compression. | OFF |
| enable_hbm | Enable HBM allocator. | OFF |
Expand Down
23 changes: 5 additions & 18 deletions ep/build-velox/src/get_velox.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@ VELOX_HOME=""

# Set to ON to run Gluten on HDFS
ENABLE_HDFS=OFF
#It can be set to OFF when compiling velox again
BUILD_PROTOBUF=ON
# Set to ON to run Gluten on S3
ENABLE_S3=OFF
# Set to ON to run Gluten on GCS
Expand All @@ -47,10 +45,6 @@ for arg in "$@"; do
VELOX_HOME=("${arg#*=}")
shift # Remove argument name from processing
;;
--build_protobuf=*)
BUILD_PROTOBUF=("${arg#*=}")
shift # Remove argument name from processing
;;
--enable_hdfs=*)
ENABLE_HDFS=("${arg#*=}")
shift # Remove argument name from processing
Expand Down Expand Up @@ -95,15 +89,13 @@ function process_setup_ubuntu {
sed -i '/ccache/a\ curl \\' scripts/setup-ubuntu.sh
sed -i '/libgmock-dev/d' scripts/setup-ubuntu.sh # resolved by ep/build-velox/build/velox_ep/CMake/resolve_dependency_modules/gtest.cmake
sed -i 's/github_checkout boostorg\/boost \"\${BOOST_VERSION}\" --recursive/wget_and_untar https:\/\/github.com\/boostorg\/boost\/releases\/download\/boost-1.84.0\/boost-1.84.0.tar.gz boost \&\& cd boost/g' scripts/setup-ubuntu.sh
sed -i '/^function install_folly.*/i function install_protobuf {\n wget https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protobuf-all-21.4.tar.gz\n tar -xzf protobuf-all-21.4.tar.gz\n cd protobuf-21.4\n ./configure CXXFLAGS="-fPIC" --prefix=/usr/local\n make "-j$(nproc)"\n sudo make install\n sudo ldconfig\n}\n' scripts/setup-ubuntu.sh
sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_protobuf' scripts/setup-ubuntu.sh
if [ $ENABLE_HDFS == "ON" ]; then
sed -i '/^function install_folly.*/i function install_libhdfs3 {\n github_checkout oap-project/libhdfs3 master \n cmake_install\n}\n' scripts/setup-ubuntu.sh
sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_libhdfs3' scripts/setup-ubuntu.sh
sed -i '/ccache /a\ yasm \\' scripts/setup-ubuntu.sh
fi
if [ $BUILD_PROTOBUF == "ON" ]; then
sed -i '/^function install_folly.*/i function install_protobuf {\n wget https://github.com/protocolbuffers/protobuf/releases/download/v21.4/protobuf-all-21.4.tar.gz\n tar -xzf protobuf-all-21.4.tar.gz\n cd protobuf-21.4\n ./configure CXXFLAGS="-fPIC" --prefix=/usr/local\n make "-j$(nproc)"\n sudo make install\n sudo ldconfig\n}\n' scripts/setup-ubuntu.sh
sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_protobuf' scripts/setup-ubuntu.sh
fi
sed -i "s/apt install -y/sudo apt install -y/" ${VELOX_HOME}/scripts/setup-adapters.sh
if [ $ENABLE_S3 == "ON" ]; then
sed -i '/^ run_and_time install_folly/a \ \ '${VELOX_HOME}/scripts'/setup-adapters.sh aws' scripts/setup-ubuntu.sh
Expand Down Expand Up @@ -136,15 +128,14 @@ function process_setup_centos8 {
sed -i '/^dnf_install autoconf/a\dnf_install libxml2-devel libgsasl-devel libuuid-devel' scripts/setup-centos8.sh
sed -i '/^function install_gflags.*/i function install_openssl {\n wget_and_untar https://github.com/openssl/openssl/archive/refs/tags/OpenSSL_1_1_1s.tar.gz openssl \n cd openssl \n ./config no-shared && make depend && make && sudo make install \n cd ..\n}\n' scripts/setup-centos8.sh
sed -i '/^ run_and_time install_fbthrift/a \ run_and_time install_openssl' scripts/setup-centos8.sh
sed -i '/cd protobuf/{n;s/\.\/configure --prefix=\/usr/\.\/configure CXXFLAGS="-fPIC" --prefix=\/usr\/local/;}' scripts/setup-centos8.sh

if [ $ENABLE_HDFS == "ON" ]; then
sed -i '/^function install_gflags.*/i function install_libhdfs3 {\n cd "\${DEPENDENCY_DIR}"\n github_checkout oap-project/libhdfs3 master\n cmake_install\n}\n' scripts/setup-centos8.sh
sed -i '/^ run_and_time install_fbthrift/a \ run_and_time install_libhdfs3' scripts/setup-centos8.sh
sed -i '/^ dnf_install ninja-build/a\ dnf_install yasm\' scripts/setup-centos8.sh
fi
if [[ $BUILD_PROTOBUF == "ON" ]] || [[ $ENABLE_HDFS == "ON" ]]; then
sed -i '/cd protobuf/{n;s/\.\/configure --prefix=\/usr/\.\/configure CXXFLAGS="-fPIC" --prefix=\/usr\/local/;}' scripts/setup-centos8.sh
fi

sed -i "s/yum -y install/sudo yum -y install/" ${VELOX_HOME}/scripts/setup-adapters.sh
if [ $ENABLE_S3 == "ON" ]; then
sed -i '/^ run_and_time install_fbthrift/a \ \ '${VELOX_HOME}/scripts'/setup-adapters.sh aws' scripts/setup-centos8.sh
Expand Down Expand Up @@ -172,15 +163,12 @@ function process_setup_centos7 {

# install gtest
sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_gtest' scripts/setup-centos7.sh

sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_protobuf' scripts/setup-centos7.sh
if [ $ENABLE_HDFS = "ON" ]; then
sed -i '/^function install_protobuf.*/i function install_libhdfs3 {\n cd "\${DEPENDENCY_DIR}"\n github_checkout oap-project/libhdfs3 master \n cmake_install\n}\n' scripts/setup-centos7.sh
sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_libhdfs3' scripts/setup-centos7.sh
sed -i '/^dnf_install ccache/a\ \ yasm \\' scripts/setup-centos7.sh
fi
if [[ $BUILD_PROTOBUF == "ON" ]] || [[ $ENABLE_HDFS == "ON" ]]; then
sed -i '/^ run_and_time install_folly/a \ \ run_and_time install_protobuf' scripts/setup-centos7.sh
fi
sed -i "s/yum -y install/sudo yum -y install/" ${VELOX_HOME}/scripts/setup-adapters.sh
if [ $ENABLE_S3 == "ON" ]; then
sed -i '/^ run_and_time install_folly/a \ \ '${VELOX_HOME}/scripts'/setup-adapters.sh aws' scripts/setup-centos7.sh
Expand Down Expand Up @@ -219,7 +207,6 @@ function process_setup_tencentos32 {

echo "Preparing Velox source code..."
echo "ENABLE_HDFS=${ENABLE_HDFS}"
echo "BUILD_PROTOBUF=${BUILD_PROTOBUF}"

CURRENT_DIR=$(
cd "$(dirname "$BASH_SOURCE")"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -464,13 +464,6 @@ trait SparkPlanExecApi {
original)
}

/**
 * Default transformer for a Spark `Size` expression.
 *
 * Wraps `child` as the only argument of a `GenericExpressionTransformer`.
 *
 * @param substraitExprName function name forwarded to the transformer
 * @param child transformer for the `Size` expression's child
 * @param original the Spark `Size` expression being converted
 * @return a `GenericExpressionTransformer` with `child` as its single argument
 */
def genSizeExpressionTransformer(
substraitExprName: String,
child: ExpressionTransformer,
original: Size): ExpressionTransformer = {
GenericExpressionTransformer(substraitExprName, Seq(child), original)
}

def genLikeTransformer(
substraitExprName: String,
left: ExpressionTransformer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -396,14 +396,12 @@ object ExpressionConverter extends SQLConfHelper with Logging {
r
)
case size: Size =>
if (size.legacySizeOfNull != SQLConf.get.legacySizeOfNull) {
throw new GlutenNotSupportException(
"The value of legacySizeOfNull field of size is " +
"not equals to legacySizeOfNull of SQLConf, this case is not supported yet")
}
BackendsApiManager.getSparkPlanExecApiInstance.genSizeExpressionTransformer(
// Covers Spark ArraySize which is replaced by Size(child, false).
val child =
replaceWithExpressionTransformerInternal(size.child, attributeSeq, expressionsMap)
GenericExpressionTransformer(
substraitExprName,
replaceWithExpressionTransformerInternal(size.child, attributeSeq, expressionsMap),
Seq(child, LiteralTransformer(size.legacySizeOfNull)),
size)
case namedStruct: CreateNamedStruct =>
BackendsApiManager.getSparkPlanExecApiInstance.genNamedStructTransformer(
Expand Down

0 comments on commit ab38690

Please sign in to comment.