From 1b8be1dee182fe8d6a756725e5a29884613d6d10 Mon Sep 17 00:00:00 2001 From: PHILO-HE Date: Fri, 28 Jun 2024 12:57:31 +0800 Subject: [PATCH] [VL] Support building arrow CPP and finding installed arrow libs from system (#6229) --- .github/workflows/velox_docker.yml | 10 +- cpp/CMake/ConfigArrow.cmake | 5 +- dev/build_arrow.sh | 92 ++++++++++++++ dev/build_helper_functions.sh | 118 ++++++++++++++++++ dev/builddeps-veloxbe.sh | 29 +++-- .../src/build_arrow_deps_centos8.sh | 45 ------- ep/build-velox/src/build_velox.sh | 39 +----- 7 files changed, 239 insertions(+), 99 deletions(-) create mode 100755 dev/build_arrow.sh create mode 100644 dev/build_helper_functions.sh delete mode 100755 ep/build-velox/src/build_arrow_deps_centos8.sh diff --git a/.github/workflows/velox_docker.yml b/.github/workflows/velox_docker.yml index d110d0a6d223..fd937f6c1d49 100644 --- a/.github/workflows/velox_docker.yml +++ b/.github/workflows/velox_docker.yml @@ -600,13 +600,11 @@ jobs: - name: Build Gluten Velox third party if: ${{ steps.cache.outputs.cache-hit != 'true' }} run: | - cd ep/build-velox/src - ./get_velox.sh source /opt/rh/gcc-toolset-9/enable - ./build_arrow_deps_centos8.sh - ./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON - cd $GITHUB_WORKSPACE/cpp - ./compile.sh --build_velox_backend=ON --build_protobuf=ON --build_tests=ON --build_examples=ON --build_benchmarks=ON + source ./dev/build_arrow.sh + install_arrow_deps + ./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_tests=ON \ + --build_examples=ON --build_benchmarks=ON --build_protobuf=ON - uses: actions/upload-artifact@v2 with: name: velox-native-lib-centos-8-${{github.sha}} diff --git a/cpp/CMake/ConfigArrow.cmake b/cpp/CMake/ConfigArrow.cmake index 8f036be53411..110836347cac 100644 --- a/cpp/CMake/ConfigArrow.cmake +++ b/cpp/CMake/ConfigArrow.cmake @@ -33,11 +33,12 @@ function(FIND_ARROW_LIB LIB_NAME) set(ARROW_LIB_FULL_NAME 
${CMAKE_SHARED_LIBRARY_PREFIX}${LIB_NAME}${ARROW_SHARED_LIBRARY_SUFFIX}) add_library(Arrow::${LIB_NAME} SHARED IMPORTED) + # Look in velox's arrow build path first (HINTS are searched before the + # default system locations); otherwise fall back to the system libraries. find_library( ARROW_LIB_${LIB_NAME} NAMES ${ARROW_LIB_FULL_NAME} - PATHS ${ARROW_LIB_DIR} ${ARROW_LIB64_DIR} - NO_DEFAULT_PATH) + HINTS ${ARROW_LIB_DIR} ${ARROW_LIB64_DIR}) if(NOT ARROW_LIB_${LIB_NAME}) message(FATAL_ERROR "Arrow library Not Found: ${ARROW_LIB_FULL_NAME}") else() diff --git a/dev/build_arrow.sh b/dev/build_arrow.sh new file mode 100755 index 000000000000..a822c4119ea0 --- /dev/null +++ b/dev/build_arrow.sh @@ -0,0 +1,92 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd) +source ${CURRENT_DIR}/build_helper_functions.sh +VELOX_ARROW_BUILD_VERSION=15.0.0 +ARROW_PREFIX=$CURRENT_DIR/arrow_ep +# Always use BUNDLED in case thrift is not installed on the system.
+THRIFT_SOURCE="BUNDLED" +BUILD_TYPE=Release + +function prepare_arrow_build() { + sudo rm -rf arrow_ep/ + wget_and_untar https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz arrow_ep + cd arrow_ep/ + patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow.patch + patch -p1 < $CURRENT_DIR/../ep/build-velox/src/modify_arrow_dataset_scan_option.patch +} + +function install_arrow_deps { + wget_and_untar https://github.com/openssl/openssl/archive/refs/tags/OpenSSL_1_1_1s.tar.gz openssl + pushd openssl + ./config no-shared && make depend && make && sudo make install + popd +} + +function build_arrow_cpp() { + if [ -n "$1" ]; then + BUILD_TYPE=$1 + fi + pushd $ARROW_PREFIX/cpp + + cmake_install \ + -DARROW_PARQUET=ON \ + -DARROW_FILESYSTEM=ON \ + -DARROW_PROTOBUF_USE_SHARED=OFF \ + -DARROW_WITH_THRIFT=ON \ + -DARROW_WITH_LZ4=ON \ + -DARROW_WITH_SNAPPY=ON \ + -DARROW_WITH_ZLIB=ON \ + -DARROW_WITH_ZSTD=ON \ + -DARROW_JEMALLOC=OFF \ + -DARROW_SIMD_LEVEL=NONE \ + -DARROW_RUNTIME_SIMD_LEVEL=NONE \ + -DARROW_WITH_UTF8PROC=OFF \ + -DARROW_TESTING=ON \ + -DCMAKE_INSTALL_PREFIX=/usr/local \ + -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DARROW_BUILD_STATIC=ON \ + -DThrift_SOURCE=${THRIFT_SOURCE} + popd +} + +function build_arrow_java() { + ARROW_INSTALL_DIR="${ARROW_PREFIX}/install" + + pushd $ARROW_PREFIX/java + # Because arrow-bom module need the -DprocessAllModules + mvn versions:set -DnewVersion=15.0.0-gluten -DprocessAllModules + + mvn clean install -pl bom,maven/module-info-compiler-maven-plugin,vector -am \ + -DskipTests -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly + + # Arrow C Data Interface CPP libraries + mvn generate-resources -P generate-libs-cdata-all-os -Darrow.c.jni.dist.dir=$ARROW_INSTALL_DIR \ + -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N + + # Arrow JNI Date Interface CPP libraries + export 
PKG_CONFIG_PATH=/usr/local/lib64/pkgconfig${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}} + mvn generate-resources -Pgenerate-libs-jni-macos-linux -N -Darrow.dataset.jni.dist.dir=$ARROW_INSTALL_DIR \ + -DARROW_GANDIVA=OFF -DARROW_JAVA_JNI_ENABLE_GANDIVA=OFF -DARROW_ORC=OFF -DARROW_JAVA_JNI_ENABLE_ORC=OFF \ + -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N + + # Arrow Java libraries + mvn install -Parrow-jni -P arrow-c-data -pl c,dataset -am \ + -Darrow.c.jni.dist.dir=$ARROW_INSTALL_DIR/lib -Darrow.dataset.jni.dist.dir=$ARROW_INSTALL_DIR/lib -Darrow.cpp.build.dir=$ARROW_INSTALL_DIR/lib \ + -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly + popd +} diff --git a/dev/build_helper_functions.sh b/dev/build_helper_functions.sh new file mode 100644 index 000000000000..221f01bf6fb5 --- /dev/null +++ b/dev/build_helper_functions.sh @@ -0,0 +1,118 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +function get_cxx_flags { + local CPU_ARCH=$1 + + local OS + OS=$(uname) + local MACHINE + MACHINE=$(uname -m) + ADDITIONAL_FLAGS="" + + if [[ -z "$CPU_ARCH" ]] || [[ $CPU_ARCH == "unknown" ]]; then + if [ "$OS" = "Darwin" ]; then + + if [ "$MACHINE" = "x86_64" ]; then + local CPU_CAPABILITIES + CPU_CAPABILITIES=$(sysctl -a | grep machdep.cpu.features | awk '{print tolower($0)}') + + if [[ $CPU_CAPABILITIES =~ "avx" ]]; then + CPU_ARCH="avx" + else + CPU_ARCH="sse" + fi + + elif [[ $(sysctl -a | grep machdep.cpu.brand_string) =~ "Apple" ]]; then + # Apple silicon. + CPU_ARCH="arm64" + fi + + # On MacOs prevent the flood of translation visibility settings warnings. + ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden" + elif [ "$OS" = "Linux" ]; then + + local CPU_CAPABILITIES + CPU_CAPABILITIES=$(cat /proc/cpuinfo | grep flags | head -n 1| awk '{print tolower($0)}') + + if [[ "$CPU_CAPABILITIES" =~ "avx" ]]; then + CPU_ARCH="avx" + elif [[ "$CPU_CAPABILITIES" =~ "sse" ]]; then + CPU_ARCH="sse" + elif [ "$MACHINE" = "aarch64" ]; then + CPU_ARCH="aarch64" + fi + fi + fi + + case $CPU_ARCH in + + "arm64") + echo -n "-mcpu=apple-m1+crc -std=c++17 -fvisibility=hidden $ADDITIONAL_FLAGS" + ;; + + "avx") + echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17 -mbmi2 $ADDITIONAL_FLAGS" + ;; + + "sse") + echo -n "-msse4.2 -std=c++17 $ADDITIONAL_FLAGS" + ;; + + "aarch64") + echo -n "-mcpu=neoverse-n1 -std=c++17 $ADDITIONAL_FLAGS" + ;; + *) + echo -n "Architecture not supported!"
+ esac + +} + +function wget_and_untar { + local URL=$1 + local DIR=$2 + mkdir -p "${DIR}" + pushd "${DIR}" + curl -L "${URL}" > $2.tar.gz + tar -xz --strip-components=1 -f $2.tar.gz + popd +} + +function cmake_install { + local NAME=$(basename "$(pwd)") + local BINARY_DIR=_build + SUDO="${SUDO:-""}" + if [ -d "${BINARY_DIR}" ] && prompt "Do you want to rebuild ${NAME}?"; then + ${SUDO} rm -rf "${BINARY_DIR}" + fi + mkdir -p "${BINARY_DIR}" + CPU_TARGET="${CPU_TARGET:-unknown}" + COMPILER_FLAGS=$(get_cxx_flags $CPU_TARGET) + + # CMAKE_POSITION_INDEPENDENT_CODE is required so that Velox can be built into dynamic libraries \ + cmake -Wno-dev -B"${BINARY_DIR}" \ + -GNinja \ + -DCMAKE_POSITION_INDEPENDENT_CODE=ON \ + -DCMAKE_CXX_STANDARD=17 \ + "${INSTALL_PREFIX+-DCMAKE_PREFIX_PATH=}${INSTALL_PREFIX-}" \ + "${INSTALL_PREFIX+-DCMAKE_INSTALL_PREFIX=}${INSTALL_PREFIX-}" \ + -DCMAKE_CXX_FLAGS="$COMPILER_FLAGS" \ + -DBUILD_TESTING=OFF \ + "$@" + + cmake --build "${BINARY_DIR}" + ${SUDO} cmake --install "${BINARY_DIR}" +} diff --git a/dev/builddeps-veloxbe.sh b/dev/builddeps-veloxbe.sh index 35976d37a036..d5e33e926d82 100755 --- a/dev/builddeps-veloxbe.sh +++ b/dev/builddeps-veloxbe.sh @@ -24,14 +24,13 @@ ENABLE_S3=OFF ENABLE_HDFS=OFF ENABLE_ABFS=OFF ENABLE_EP_CACHE=OFF -ARROW_ENABLE_CUSTOM_CODEC=OFF ENABLE_VCPKG=OFF RUN_SETUP_SCRIPT=ON VELOX_REPO="" VELOX_BRANCH="" VELOX_HOME="" VELOX_PARAMETER="" -COMPILE_ARROW_JAVA=ON +BUILD_ARROW=ON SPARK_VERSION=ALL # set default number of threads as cpu cores minus 2 @@ -72,12 +71,10 @@ do ;; --enable_qat=*) ENABLE_QAT=("${arg#*=}") - ARROW_ENABLE_CUSTOM_CODEC=("${arg#*=}") shift # Remove argument name from processing ;; --enable_iaa=*) ENABLE_IAA=("${arg#*=}") - ARROW_ENABLE_CUSTOM_CODEC=("${arg#*=}") shift # Remove argument name from processing ;; --enable_hbm=*) @@ -136,8 +133,8 @@ do BUILD_VELOX_BENCHMARKS=("${arg#*=}") shift # Remove argument name from processing ;; - --compile_arrow_java=*) - COMPILE_ARROW_JAVA=("${arg#*=}") + 
--build_arrow=*) + BUILD_ARROW=("${arg#*=}") shift # Remove argument name from processing ;; --num_threads=*) @@ -189,6 +186,18 @@ fi concat_velox_param +function build_arrow { + echo "Start to build Arrow" + export SUDO=sudo + cd $GLUTEN_DIR/dev + source build_arrow.sh + prepare_arrow_build + build_arrow_cpp $BUILD_TYPE + echo "Finished building arrow CPP" + build_arrow_java + echo "Finished building arrow Java" +} + function build_velox { echo "Start to build Velox" cd $GLUTEN_DIR/ep/build-velox/src @@ -196,7 +205,7 @@ function build_velox { # When BUILD_TESTS is on for gluten cpp, we need turn on VELOX_BUILD_TEST_UTILS via build_test_utils. ./build_velox.sh --run_setup_script=$RUN_SETUP_SCRIPT --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --build_type=$BUILD_TYPE --enable_hdfs=$ENABLE_HDFS \ --enable_abfs=$ENABLE_ABFS --enable_ep_cache=$ENABLE_EP_CACHE --build_test_utils=$BUILD_TESTS --build_tests=$BUILD_VELOX_TESTS --build_benchmarks=$BUILD_VELOX_BENCHMARKS \ - --compile_arrow_java=$COMPILE_ARROW_JAVA --num_threads=$NUM_THREADS + --num_threads=$NUM_THREADS } ## compile gluten cpp @@ -208,11 +217,15 @@ function build_gluten_cpp { cd build cmake -DBUILD_VELOX_BACKEND=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE \ -DBUILD_TESTS=$BUILD_TESTS -DBUILD_EXAMPLES=$BUILD_EXAMPLES -DBUILD_BENCHMARKS=$BUILD_BENCHMARKS -DBUILD_JEMALLOC=$BUILD_JEMALLOC \ - -DENABLE_HBM=$ENABLE_HBM -DENABLE_QAT=$ENABLE_QAT -DENABLE_IAA=$ENABLE_IAA -DENABLE_GCS=$ENABLE_GCS -DENABLE_S3=$ENABLE_S3 -DENABLE_HDFS=$ENABLE_HDFS -DENABLE_ABFS=$ENABLE_ABFS .. + -DENABLE_HBM=$ENABLE_HBM -DENABLE_QAT=$ENABLE_QAT -DENABLE_IAA=$ENABLE_IAA -DBUILD_PROTOBUF=$BUILD_PROTOBUF -DENABLE_GCS=$ENABLE_GCS \ + -DENABLE_S3=$ENABLE_S3 -DENABLE_HDFS=$ENABLE_HDFS -DENABLE_ABFS=$ENABLE_ABFS .. 
make -j $NUM_THREADS } function build_velox_backend { + if [ "$BUILD_ARROW" == "ON" ]; then + build_arrow + fi build_velox build_gluten_cpp } diff --git a/ep/build-velox/src/build_arrow_deps_centos8.sh b/ep/build-velox/src/build_arrow_deps_centos8.sh deleted file mode 100755 index 8dfc2af9375e..000000000000 --- a/ep/build-velox/src/build_arrow_deps_centos8.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -set -exu - -NPROC=$(getconf _NPROCESSORS_ONLN) - -function wget_and_untar { - local URL=$1 - local DIR=$2 - mkdir -p "${DIR}" - pushd "${DIR}" - curl -L "${URL}" > $2.tar.gz - tar -xz --strip-components=1 -f $2.tar.gz - popd -} - -function install_openssl { - wget_and_untar https://github.com/openssl/openssl/archive/refs/tags/OpenSSL_1_1_1s.tar.gz openssl - cd openssl - ./config no-shared && make depend && make && sudo make install - cd .. -} - -function install_arrow_deps { - install_openssl -} - -# Activate gcc9; enable errors on unset variables afterwards. -# source /opt/rh/gcc-toolset-9/enable || exit 1 -install_arrow_deps -echo "All dependencies for Arrow installed!"
diff --git a/ep/build-velox/src/build_velox.sh b/ep/build-velox/src/build_velox.sh index 0224e9546861..97b3db5549bc 100755 --- a/ep/build-velox/src/build_velox.sh +++ b/ep/build-velox/src/build_velox.sh @@ -34,7 +34,6 @@ ENABLE_TESTS=OFF # Set to ON for gluten cpp test build. BUILD_TEST_UTILS=OFF RUN_SETUP_SCRIPT=ON -COMPILE_ARROW_JAVA=ON NUM_THREADS="" OTHER_ARGUMENTS="" @@ -87,10 +86,6 @@ for arg in "$@"; do RUN_SETUP_SCRIPT=("${arg#*=}") shift # Remove argument name from processing ;; - --compile_arrow_java=*) - COMPILE_ARROW_JAVA=("${arg#*=}") - shift # Remove argument name from processing - ;; --num_threads=*) NUM_THREADS=("${arg#*=}") shift # Remove argument name from processing @@ -191,7 +186,7 @@ function get_build_summary { echo "ENABLE_S3=$ENABLE_S3,ENABLE_GCS=$ENABLE_GCS,ENABLE_HDFS=$ENABLE_HDFS,ENABLE_ABFS=$ENABLE_ABFS,\ BUILD_TYPE=$BUILD_TYPE,VELOX_HOME=$VELOX_HOME,ENABLE_BENCHMARK=$ENABLE_BENCHMARK,\ ENABLE_TESTS=$ENABLE_TESTS,BUILD_TEST_UTILS=$BUILD_TEST_UTILS,\ -COMPILE_ARROW_JAVA=$COMPILE_ARROW_JAVA,OTHER_ARGUMENTS=$OTHER_ARGUMENTS,COMMIT_HASH=$COMMIT_HASH" +OTHER_ARGUMENTS=$OTHER_ARGUMENTS,COMMIT_HASH=$COMMIT_HASH" } function check_commit { @@ -277,34 +272,6 @@ function setup_linux { fi } -function compile_arrow_java_module() { - ARROW_HOME="${VELOX_HOME}/_build/$COMPILE_TYPE/third_party/arrow_ep/src/arrow_ep" - ARROW_INSTALL_DIR="${ARROW_HOME}/../../install" - - pushd $ARROW_HOME/java - # Because arrow-bom module need the -DprocessAllModules - mvn versions:set -DnewVersion=15.0.0-gluten -DprocessAllModules - - mvn clean install -pl bom,maven/module-info-compiler-maven-plugin,vector -am \ - -DskipTests -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly - - # Arrow C Data Interface CPP libraries - mvn generate-resources -P generate-libs-cdata-all-os -Darrow.c.jni.dist.dir=$ARROW_INSTALL_DIR \ - -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N - - # Arrow JNI Date Interface CPP libraries - export 
PKG_CONFIG_PATH=/usr/local/lib64/pkgconfig${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}} - mvn generate-resources -Pgenerate-libs-jni-macos-linux -N -Darrow.dataset.jni.dist.dir=$ARROW_INSTALL_DIR \ - -DARROW_GANDIVA=OFF -DARROW_JAVA_JNI_ENABLE_GANDIVA=OFF -DARROW_ORC=OFF -DARROW_JAVA_JNI_ENABLE_ORC=OFF \ - -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N - - # Arrow Java libraries - mvn install -Parrow-jni -P arrow-c-data -pl c,dataset -am \ - -Darrow.c.jni.dist.dir=$ARROW_INSTALL_DIR/lib -Darrow.dataset.jni.dist.dir=$ARROW_INSTALL_DIR/lib -Darrow.cpp.build.dir=$ARROW_INSTALL_DIR/lib \ - -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly - popd -} - CURRENT_DIR=$( cd "$(dirname "$BASH_SOURCE")" pwd @@ -334,9 +301,5 @@ echo "Target Velox build: $TARGET_BUILD_SUMMARY" check_commit compile -if [ $COMPILE_ARROW_JAVA == "ON" ]; then - compile_arrow_java_module -fi - echo "Successfully built Velox from Source." echo $TARGET_BUILD_SUMMARY >"${VELOX_HOME}/velox-build.cache"