Skip to content

Commit

Permalink
[VL] Support building arrow CPP and finding installed arrow libs from…
Browse files Browse the repository at this point in the history
… system (#6229)
  • Loading branch information
PHILO-HE authored Jun 28, 2024
1 parent 2bb813c commit 1b8be1d
Show file tree
Hide file tree
Showing 7 changed files with 239 additions and 99 deletions.
10 changes: 4 additions & 6 deletions .github/workflows/velox_docker.yml
Original file line number Diff line number Diff line change
Expand Up @@ -600,13 +600,11 @@ jobs:
- name: Build Gluten Velox third party
if: ${{ steps.cache.outputs.cache-hit != 'true' }}
run: |
cd ep/build-velox/src
./get_velox.sh
source /opt/rh/gcc-toolset-9/enable
./build_arrow_deps_centos8.sh
./build_velox.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_test_utils=ON
cd $GITHUB_WORKSPACE/cpp
./compile.sh --build_velox_backend=ON --build_protobuf=ON --build_tests=ON --build_examples=ON --build_benchmarks=ON
source ./dev/build_arrow.sh
install_arrow_deps
./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_tests=ON \
--build_examples=ON --build_benchmarks=ON --build_protobuf=ON
- uses: actions/upload-artifact@v2
with:
name: velox-native-lib-centos-8-${{github.sha}}
Expand Down
5 changes: 3 additions & 2 deletions cpp/CMake/ConfigArrow.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,12 @@ function(FIND_ARROW_LIB LIB_NAME)
set(ARROW_LIB_FULL_NAME
${CMAKE_SHARED_LIBRARY_PREFIX}${LIB_NAME}${ARROW_SHARED_LIBRARY_SUFFIX})
add_library(Arrow::${LIB_NAME} SHARED IMPORTED)
# Firstly find the lib from velox's arrow build path. If not found, try to
# find it from system.
find_library(
ARROW_LIB_${LIB_NAME}
NAMES ${ARROW_LIB_FULL_NAME}
PATHS ${ARROW_LIB_DIR} ${ARROW_LIB64_DIR}
NO_DEFAULT_PATH)
PATHS ${ARROW_LIB_DIR} ${ARROW_LIB64_DIR})
if(NOT ARROW_LIB_${LIB_NAME})
message(FATAL_ERROR "Arrow library Not Found: ${ARROW_LIB_FULL_NAME}")
else()
Expand Down
92 changes: 92 additions & 0 deletions dev/build_arrow.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Absolute path of the directory that contains this script (works when the
# script is sourced, because it is derived from BASH_SOURCE).
CURRENT_DIR=$(cd "$(dirname "$BASH_SOURCE")"; pwd)
# Brings in the shared helpers stored next to this script.
source ${CURRENT_DIR}/build_helper_functions.sh
# Arrow release that is downloaded and built.
VELOX_ARROW_BUILD_VERSION=15.0.0
# Root of the Arrow source tree used by the build functions below.
ARROW_PREFIX=$CURRENT_DIR/arrow_ep
# Always use the BUNDLED Thrift, in case Thrift is not installed on the system.
THRIFT_SOURCE="BUNDLED"
# Default CMake build type; build_arrow_cpp replaces it when given an argument.
# NOTE(review): this assignment is unconditional, so sourcing this file
# overwrites any BUILD_TYPE the sourcing script had already set.
BUILD_TYPE=Release

# Downloads the Arrow ${VELOX_ARROW_BUILD_VERSION} source release into
# ${ARROW_PREFIX} and applies the two Arrow patches kept under
# ep/build-velox/src.
#
# Any previous source tree is removed first. On success the working directory
# is left at the root of the unpacked Arrow tree. Returns non-zero as soon as
# the download, the cd or the first patch fails; otherwise returns the status
# of the second patch.
function prepare_arrow_build() {
  local patch_dir="${CURRENT_DIR}/../ep/build-velox/src"
  local arrow_url="https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz"

  # Use the absolute ARROW_PREFIX (the path build_arrow_cpp/build_arrow_java
  # later enter) instead of a relative "arrow_ep/", so the result does not
  # depend on the caller's current directory.
  sudo rm -rf "${ARROW_PREFIX}"
  wget_and_untar "${arrow_url}" "${ARROW_PREFIX}" || return 1

  # Never apply the patches outside of the Arrow tree.
  cd "${ARROW_PREFIX}" || return 1
  patch -p1 < "${patch_dir}/modify_arrow.patch" || return 1
  patch -p1 < "${patch_dir}/modify_arrow_dataset_scan_option.patch"
}

# Downloads the OpenSSL 1.1.1s sources into ./openssl, builds them without
# shared libraries and installs the result with sudo.
function install_arrow_deps {
  local openssl_url="https://github.com/openssl/openssl/archive/refs/tags/OpenSSL_1_1_1s.tar.gz"
  local openssl_dir="openssl"

  wget_and_untar "${openssl_url}" "${openssl_dir}"
  pushd "${openssl_dir}"
  # Each step only runs when the previous one succeeded.
  ./config no-shared \
    && make depend \
    && make \
    && sudo make install
  popd
}

# Configures, builds and installs Arrow C++ from ${ARROW_PREFIX}/cpp through
# cmake_install, with the install prefix set to /usr/local.
#
# Arguments:
#   $1 - optional CMake build type; when non-empty it replaces the global
#        BUILD_TYPE before the build is configured.
# Returns:
#   non-zero when the source directory cannot be entered, otherwise the status
#   of cmake_install.
function build_arrow_cpp() {
  # ${1:-} keeps the check valid when the caller runs with `set -u` and passes
  # no argument.
  if [ -n "${1:-}" ]; then
    BUILD_TYPE=$1
  fi
  # Do not run the build in whatever directory we happen to be in.
  pushd "${ARROW_PREFIX}/cpp" || return 1

  cmake_install \
    -DARROW_PARQUET=ON \
    -DARROW_FILESYSTEM=ON \
    -DARROW_PROTOBUF_USE_SHARED=OFF \
    -DARROW_WITH_THRIFT=ON \
    -DARROW_WITH_LZ4=ON \
    -DARROW_WITH_SNAPPY=ON \
    -DARROW_WITH_ZLIB=ON \
    -DARROW_WITH_ZSTD=ON \
    -DARROW_JEMALLOC=OFF \
    -DARROW_SIMD_LEVEL=NONE \
    -DARROW_RUNTIME_SIMD_LEVEL=NONE \
    -DARROW_WITH_UTF8PROC=OFF \
    -DARROW_TESTING=ON \
    -DCMAKE_INSTALL_PREFIX=/usr/local \
    -DCMAKE_BUILD_TYPE="${BUILD_TYPE}" \
    -DARROW_BUILD_STATIC=ON \
    -DThrift_SOURCE="${THRIFT_SOURCE}"
  local status=$?
  popd
  return "${status}"
}

# Builds and installs the Arrow Java artifacts (including the C Data and
# Dataset JNI modules) from ${ARROW_PREFIX}/java with Maven.
#
# The artifacts are versioned "${VELOX_ARROW_BUILD_VERSION}-gluten" and the
# native JNI libraries are generated under ${ARROW_PREFIX}/install.
# Returns non-zero when the Java source directory cannot be entered.
function build_arrow_java() {
  ARROW_INSTALL_DIR="${ARROW_PREFIX}/install"
  # Derived from the Arrow version that was downloaded, so the Java version
  # cannot drift from the C++ one.
  local arrow_java_version="${VELOX_ARROW_BUILD_VERSION}-gluten"

  pushd "${ARROW_PREFIX}/java" || return 1
  # -DprocessAllModules is needed because of the arrow-bom module.
  mvn versions:set -DnewVersion="${arrow_java_version}" -DprocessAllModules

  mvn clean install -pl bom,maven/module-info-compiler-maven-plugin,vector -am \
    -DskipTests -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly

  # Arrow C Data Interface CPP libraries
  mvn generate-resources -P generate-libs-cdata-all-os -Darrow.c.jni.dist.dir="${ARROW_INSTALL_DIR}" \
    -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -N

  # Arrow Dataset JNI CPP libraries
  # NOTE(review): presumably this lets pkg-config find the Arrow C++ build
  # installed under /usr/local - confirm.
  export PKG_CONFIG_PATH=/usr/local/lib64/pkgconfig${PKG_CONFIG_PATH:+:${PKG_CONFIG_PATH}}
  mvn generate-resources -Pgenerate-libs-jni-macos-linux -N -Darrow.dataset.jni.dist.dir="${ARROW_INSTALL_DIR}" \
    -DARROW_GANDIVA=OFF -DARROW_JAVA_JNI_ENABLE_GANDIVA=OFF -DARROW_ORC=OFF -DARROW_JAVA_JNI_ENABLE_ORC=OFF \
    -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip

  # Arrow Java libraries
  mvn install -Parrow-jni -P arrow-c-data -pl c,dataset -am \
    -Darrow.c.jni.dist.dir="${ARROW_INSTALL_DIR}/lib" -Darrow.dataset.jni.dist.dir="${ARROW_INSTALL_DIR}/lib" -Darrow.cpp.build.dir="${ARROW_INSTALL_DIR}/lib" \
    -Dmaven.test.skip -Drat.skip -Dmaven.gitcommitid.skip -Dcheckstyle.skip -Dassembly.skipAssembly
  popd
}
118 changes: 118 additions & 0 deletions dev/build_helper_functions.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
#!/bin/bash
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Prints (without a trailing newline) the C++ compiler flags for a CPU
# architecture.
#
# Arguments:
#   $1 - "arm64", "avx", "sse" or "aarch64". When empty or "unknown" the
#        architecture is detected from the host: sysctl on macOS,
#        /proc/cpuinfo on Linux.
# Outputs:
#   The flag string on stdout, or the text "Architecture not supported!" when
#   no known architecture applies.
# Side effects:
#   Sets the global ADDITIONAL_FLAGS.
function get_cxx_flags {
  local CPU_ARCH=${1:-}

  local OS
  OS=$(uname)
  local MACHINE
  MACHINE=$(uname -m)
  ADDITIONAL_FLAGS=""

  if [[ -z "$CPU_ARCH" ]] || [[ $CPU_ARCH == "unknown" ]]; then
    if [ "$OS" = "Darwin" ]; then

      if [ "$MACHINE" = "x86_64" ]; then
        local CPU_CAPABILITIES
        CPU_CAPABILITIES=$(sysctl -a | grep machdep.cpu.features | awk '{print tolower($0)}')

        if [[ $CPU_CAPABILITIES =~ "avx" ]]; then
          CPU_ARCH="avx"
        else
          CPU_ARCH="sse"
        fi

      elif [[ $(sysctl -a | grep machdep.cpu.brand_string) =~ "Apple" ]]; then
        # Apple silicon.
        CPU_ARCH="arm64"
      fi

      # On macOS prevent the flood of translation visibility settings warnings.
      ADDITIONAL_FLAGS="-fvisibility=hidden -fvisibility-inlines-hidden"
    elif [ "$OS" = "Linux" ]; then
      # This has to be a real `elif ...; then` branch: written as
      # `else [ ... ];` the test is only a discarded command and the Linux
      # probing would run on every non-Darwin system.
      local CPU_CAPABILITIES
      CPU_CAPABILITIES=$(grep flags /proc/cpuinfo | head -n 1 | awk '{print tolower($0)}')

      if [[ "$CPU_CAPABILITIES" =~ "avx" ]]; then
        CPU_ARCH="avx"
      elif [[ "$CPU_CAPABILITIES" =~ "sse" ]]; then
        CPU_ARCH="sse"
      elif [ "$MACHINE" = "aarch64" ]; then
        CPU_ARCH="aarch64"
      fi
    fi
  fi

  case $CPU_ARCH in

    "arm64")
      echo -n "-mcpu=apple-m1+crc -std=c++17 -fvisibility=hidden $ADDITIONAL_FLAGS"
      ;;

    "avx")
      echo -n "-mavx2 -mfma -mavx -mf16c -mlzcnt -std=c++17 -mbmi2 $ADDITIONAL_FLAGS"
      ;;

    "sse")
      echo -n "-msse4.2 -std=c++17 $ADDITIONAL_FLAGS"
      ;;

    "aarch64")
      echo -n "-mcpu=neoverse-n1 -std=c++17 $ADDITIONAL_FLAGS"
      ;;
    *)
      # NOTE(review): this message goes to stdout, so a caller that captures
      # the output receives it in place of compiler flags.
      echo -n "Architecture not supported!"
      ;;
  esac

}

# Downloads a gzip-compressed tarball and unpacks it into a directory,
# dropping the archive's top-level path component.
#
# Arguments:
#   $1 - URL of the .tar.gz archive.
#   $2 - target directory (created when missing); it may be nested or absolute.
# Returns:
#   non-zero when the directory cannot be created or entered, or when the
#   download or the extraction fails. The caller's working directory is
#   restored before returning.
function wget_and_untar {
  local URL=$1
  local DIR=$2
  # The archive is stored inside DIR, named after DIR's last path component,
  # so a DIR that contains slashes still yields a valid file name.
  local TARBALL
  TARBALL="$(basename "${DIR}").tar.gz"
  local STATUS=0

  mkdir -p "${DIR}" || return 1
  pushd "${DIR}" || return 1
  # -f makes curl fail on HTTP errors instead of saving the error page as the
  # archive.
  curl -fL "${URL}" > "${TARBALL}" \
    && tar -xz --strip-components=1 -f "${TARBALL}" \
    || STATUS=$?
  popd
  return "${STATUS}"
}

# Configures, builds and installs the CMake project in the current directory,
# using ./_build as the binary directory and Ninja as the generator.
#
# Arguments:
#   "$@" - extra arguments appended to the configure command line, so they
#          take precedence over the defaults set here.
# Environment:
#   SUDO           - optional command prefix (e.g. "sudo") for the removal of
#                    an old build directory and for the install step.
#   CPU_TARGET     - architecture handed to get_cxx_flags (default "unknown").
#   INSTALL_PREFIX - when set, used as CMAKE_PREFIX_PATH and
#                    CMAKE_INSTALL_PREFIX.
# Returns:
#   non-zero when configuring, building or installing fails.
function cmake_install {
  local NAME
  NAME=$(basename "$(pwd)")
  local BINARY_DIR=_build
  SUDO="${SUDO:-""}"
  # `prompt` is not defined in this file; only ask when the sourcing script
  # provides it. Without it an existing build directory is kept.
  if [ -d "${BINARY_DIR}" ] && command -v prompt > /dev/null 2>&1 \
    && prompt "Do you want to rebuild ${NAME}?"; then
    ${SUDO} rm -rf "${BINARY_DIR}"
  fi
  mkdir -p "${BINARY_DIR}"
  CPU_TARGET="${CPU_TARGET:-unknown}"
  COMPILER_FLAGS=$(get_cxx_flags "$CPU_TARGET")

  # Only pass the prefix options when INSTALL_PREFIX is set, instead of
  # handing cmake two empty-string arguments.
  local -a PREFIX_ARGS=()
  if [ -n "${INSTALL_PREFIX+set}" ]; then
    PREFIX_ARGS=(
      "-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX}"
      "-DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX}"
    )
  fi

  # CMAKE_POSITION_INDEPENDENT_CODE is required so that the static libraries
  # built here can be linked into dynamic libraries.
  # The ${arr[@]+...} form keeps an empty array safe under `set -u` on old bash.
  cmake -Wno-dev -B"${BINARY_DIR}" \
    -GNinja \
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON \
    -DCMAKE_CXX_STANDARD=17 \
    ${PREFIX_ARGS[@]+"${PREFIX_ARGS[@]}"} \
    -DCMAKE_CXX_FLAGS="$COMPILER_FLAGS" \
    -DBUILD_TESTING=OFF \
    "$@" || return 1

  # Do not install the output of a failed build.
  cmake --build "${BINARY_DIR}" || return 1
  ${SUDO} cmake --install "${BINARY_DIR}"
}
29 changes: 21 additions & 8 deletions dev/builddeps-veloxbe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,14 +24,13 @@ ENABLE_S3=OFF
ENABLE_HDFS=OFF
ENABLE_ABFS=OFF
ENABLE_EP_CACHE=OFF
ARROW_ENABLE_CUSTOM_CODEC=OFF
ENABLE_VCPKG=OFF
RUN_SETUP_SCRIPT=ON
VELOX_REPO=""
VELOX_BRANCH=""
VELOX_HOME=""
VELOX_PARAMETER=""
COMPILE_ARROW_JAVA=ON
BUILD_ARROW=ON
SPARK_VERSION=ALL

# set default number of threads as cpu cores minus 2
Expand Down Expand Up @@ -72,12 +71,10 @@ do
;;
--enable_qat=*)
ENABLE_QAT=("${arg#*=}")
ARROW_ENABLE_CUSTOM_CODEC=("${arg#*=}")
shift # Remove argument name from processing
;;
--enable_iaa=*)
ENABLE_IAA=("${arg#*=}")
ARROW_ENABLE_CUSTOM_CODEC=("${arg#*=}")
shift # Remove argument name from processing
;;
--enable_hbm=*)
Expand Down Expand Up @@ -136,8 +133,8 @@ do
BUILD_VELOX_BENCHMARKS=("${arg#*=}")
shift # Remove argument name from processing
;;
--compile_arrow_java=*)
COMPILE_ARROW_JAVA=("${arg#*=}")
--build_arrow=*)
BUILD_ARROW=("${arg#*=}")
shift # Remove argument name from processing
;;
--num_threads=*)
Expand Down Expand Up @@ -189,14 +186,26 @@ fi

concat_velox_param

# Downloads, patches and builds Arrow (C++ first, then Java) with the helpers
# from dev/build_arrow.sh, using the build type selected for this script.
#
# Side effects: exports SUDO=sudo and changes the working directory.
# Returns non-zero when $GLUTEN_DIR/dev cannot be entered or build_arrow.sh
# cannot be sourced.
function build_arrow {
  echo "Start to build Arrow"
  export SUDO=sudo
  cd "$GLUTEN_DIR/dev" || return 1

  # build_arrow.sh assigns BUILD_TYPE=Release when sourced. Remember the value
  # requested for this build and put it back afterwards, otherwise Arrow and
  # every later step that reads BUILD_TYPE would silently switch to Release.
  local requested_build_type="${BUILD_TYPE:-}"
  # The explicit ./ stops `source` from looking the file up in PATH first.
  source ./build_arrow.sh || return 1
  if [ -n "$requested_build_type" ]; then
    BUILD_TYPE=$requested_build_type
  fi

  prepare_arrow_build
  build_arrow_cpp "$BUILD_TYPE"
  echo "Finished building arrow CPP"
  build_arrow_java
  echo "Finished building arrow Java"
}

# Fetches the Velox sources with get_velox.sh and builds them with
# build_velox.sh, forwarding the feature switches selected for this script.
# Both scripts are run from $GLUTEN_DIR/ep/build-velox/src.
function build_velox {
  echo "Start to build Velox"
  cd $GLUTEN_DIR/ep/build-velox/src
  ./get_velox.sh --enable_hdfs=$ENABLE_HDFS --build_protobuf=$BUILD_PROTOBUF --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --enable_abfs=$ENABLE_ABFS $VELOX_PARAMETER
  # When BUILD_TESTS is on for gluten cpp, we need to turn on VELOX_BUILD_TEST_UTILS via build_test_utils.
  # Every line of the command except the last must end with a backslash;
  # otherwise the remaining options would be executed as a separate command.
  ./build_velox.sh --run_setup_script=$RUN_SETUP_SCRIPT --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --build_type=$BUILD_TYPE --enable_hdfs=$ENABLE_HDFS \
    --enable_abfs=$ENABLE_ABFS --enable_ep_cache=$ENABLE_EP_CACHE --build_test_utils=$BUILD_TESTS --build_tests=$BUILD_VELOX_TESTS --build_benchmarks=$BUILD_VELOX_BENCHMARKS \
    --num_threads=$NUM_THREADS
}

## compile gluten cpp
Expand All @@ -208,11 +217,15 @@ function build_gluten_cpp {
cd build
cmake -DBUILD_VELOX_BACKEND=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
-DBUILD_TESTS=$BUILD_TESTS -DBUILD_EXAMPLES=$BUILD_EXAMPLES -DBUILD_BENCHMARKS=$BUILD_BENCHMARKS -DBUILD_JEMALLOC=$BUILD_JEMALLOC \
-DENABLE_HBM=$ENABLE_HBM -DENABLE_QAT=$ENABLE_QAT -DENABLE_IAA=$ENABLE_IAA -DENABLE_GCS=$ENABLE_GCS -DENABLE_S3=$ENABLE_S3 -DENABLE_HDFS=$ENABLE_HDFS -DENABLE_ABFS=$ENABLE_ABFS ..
-DENABLE_HBM=$ENABLE_HBM -DENABLE_QAT=$ENABLE_QAT -DENABLE_IAA=$ENABLE_IAA -DBUILD_PROTOBUF=$BUILD_PROTOBUF -DENABLE_GCS=$ENABLE_GCS \
-DENABLE_S3=$ENABLE_S3 -DENABLE_HDFS=$ENABLE_HDFS -DENABLE_ABFS=$ENABLE_ABFS ..
make -j $NUM_THREADS
}

# Builds the Velox backend: Arrow first (only when BUILD_ARROW is "ON"), then
# Velox, then the Gluten C++ library.
function build_velox_backend {
  # Quoted and compared with the portable `=` so that an empty or unset
  # BUILD_ARROW is simply treated as "not ON" instead of making `[` fail.
  if [ "${BUILD_ARROW:-}" = "ON" ]; then
    build_arrow
  fi
  build_velox
  build_gluten_cpp
}
Expand Down
45 changes: 0 additions & 45 deletions ep/build-velox/src/build_arrow_deps_centos8.sh

This file was deleted.

Loading

0 comments on commit 1b8be1d

Please sign in to comment.