Skip to content

Commit

Permalink
[GLUTEN-3572][VL] Remove --arrow_home option and fix "libarrow not fo…
Browse files Browse the repository at this point in the history
…und" in debug mode (#3573)
  • Loading branch information
Surbhi-Vijay authored Nov 8, 2023
1 parent a090556 commit 29b5899
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 81 deletions.
19 changes: 9 additions & 10 deletions .github/workflows/velox_be.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ on:
- 'shims/**'
- 'tools/gluten-it/**'
- 'tools/gluten-te/**'
- 'ep/build-arrow/**'
- 'ep/build-velox/**'
- 'cpp/*'
- 'cpp/CMake/**'
Expand Down Expand Up @@ -62,7 +61,7 @@ jobs:
run: |
docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --build_tests=ON --build_examples=ON --build_benchmarks=ON'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_tests=ON --build_examples=ON --build_benchmarks=ON'
- name: Run CPP unit test
run: |
docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/cpp/build && \
Expand Down Expand Up @@ -108,7 +107,7 @@ jobs:
run: |
docker exec ubuntu2004-test-slow-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox'
- name: Build and run unit test for Spark 3.2.2(slow tests)
run: |
docker exec ubuntu2004-test-slow-$GITHUB_RUN_ID bash -c '
Expand Down Expand Up @@ -146,7 +145,7 @@ jobs:
run: |
docker exec ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID bash -l -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox'
- name: Build and Run unit test for Spark 3.3.1(slow tests)
run: |
docker exec ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID bash -l -c 'cd /opt/gluten && \
Expand Down Expand Up @@ -183,7 +182,7 @@ jobs:
run: |
docker exec ubuntu2004-test-spark33-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --build_examples=ON'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_examples=ON'
- name: Build and Run unit test for Spark 3.3.1(other tests)
run: |
docker exec ubuntu2004-test-spark33-$GITHUB_RUN_ID bash -c 'cd /opt/gluten && \
Expand Down Expand Up @@ -213,7 +212,7 @@ jobs:
run: |
docker exec ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID bash -l -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox '
- name: Build and Run unit test for Spark 3.4.1(slow tests)
run: |
docker exec ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID bash -l -c 'cd /opt/gluten && \
Expand Down Expand Up @@ -250,7 +249,7 @@ jobs:
run: |
docker exec ubuntu2004-test-spark34-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --build_examples=ON'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_examples=ON'
- name: Build and Run unit test for Spark 3.4.1(other tests)
run: |
docker exec ubuntu2004-test-spark34-$GITHUB_RUN_ID bash -c 'cd /opt/gluten && \
Expand Down Expand Up @@ -280,7 +279,7 @@ jobs:
run: |
docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c '
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --enable_hdfs=ON --enable_s3=ON'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON'
- name: Build for Spark 3.2.2
run: |
docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c '
Expand Down Expand Up @@ -361,7 +360,7 @@ jobs:
docker exec centos8-test-$GITHUB_RUN_ID bash -c '
source /env.sh && \
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --enable_hdfs=ON --enable_s3=ON'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON'
- name: Build for Spark 3.2.2
run: |
docker exec centos8-test-$GITHUB_RUN_ID bash -c '
Expand Down Expand Up @@ -410,7 +409,7 @@ jobs:
docker exec centos7-test-$GITHUB_RUN_ID bash -c '
source /env.sh && \
cd /opt/gluten/cpp && \
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep'
./compile.sh --build_velox_backend=ON --velox_home=/opt/velox'
- name: Build for Spark 3.2.2
run: |
docker exec centos7-test-$GITHUB_RUN_ID bash -c '
Expand Down
15 changes: 9 additions & 6 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,12 +55,15 @@ option(ENABLE_ORC "Enable ORC" OFF)
set(root_directory ${PROJECT_BINARY_DIR})
get_filename_component(GLUTEN_HOME ${CMAKE_SOURCE_DIR} DIRECTORY)

if (NOT DEFINED ARROW_HOME)
if (${CMAKE_BUILD_TYPE} STREQUAL "Debug")
set(ARROW_HOME ${GLUTEN_HOME}/ep/build-velox/build/velox_ep/_build/debug/third_party/arrow_ep)
else()
set(ARROW_HOME ${GLUTEN_HOME}/ep/build-velox/build/velox_ep/_build/release/third_party/arrow_ep)
endif()
# Fall back to the in-tree Velox location under ep/build-velox when the caller
# did not define VELOX_HOME (e.g. via -DVELOX_HOME=...).
if (NOT DEFINED VELOX_HOME)
  set(VELOX_HOME ${GLUTEN_HOME}/ep/build-velox/build/velox_ep)
  message(STATUS "Set VELOX_HOME to ${VELOX_HOME}")
endif()

# ARROW_HOME is always derived from VELOX_HOME: it points at the arrow_ep
# directory inside Velox's debug or release build tree.
# The expansion is quoted so that an empty CMAKE_BUILD_TYPE compares as an empty
# string (and selects the release path) instead of collapsing to
# `if ( STREQUAL "Debug")`, which is a configure-time error.
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
  set(ARROW_HOME ${VELOX_HOME}/_build/debug/third_party/arrow_ep)
else()
  set(ARROW_HOME ${VELOX_HOME}/_build/release/third_party/arrow_ep)
endif()

#
Expand Down
12 changes: 0 additions & 12 deletions cpp/compile.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,16 +27,11 @@ ENABLE_QAT=OFF
ENABLE_HBM=OFF
ENABLE_S3=OFF
ENABLE_HDFS=OFF
ARROW_HOME=
VELOX_HOME=
NPROC=$(nproc --ignore=2)

for arg in "$@"; do
case $arg in
--arrow_home=*)
ARROW_HOME=("${arg#*=}")
shift # Remove argument name from processing
;;
--velox_home=*)
VELOX_HOME=("${arg#*=}")
shift # Remove argument name from processing
Expand Down Expand Up @@ -97,19 +92,13 @@ CURRENT_DIR=$(
pwd
)

#gluten cpp will find arrow/parquet lib from ARROW_HOME
if [ "$ARROW_HOME" == "" ]; then
ARROW_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep/_build/release/third_party/arrow_ep"
fi

#gluten cpp will find velox lib from VELOX_HOME
if [ "$VELOX_HOME" == "" ]; then
VELOX_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep"
fi

echo "Building gluten cpp part..."
echo "CMAKE Arguments:"
echo "ARROW_HOME=${ARROW_HOME}"
echo "VELOX_HOME=${VELOX_HOME}"
echo "BUILD_TYPE=${BUILD_TYPE}"
echo "BUILD_VELOX_BACKEND=${BUILD_VELOX_BACKEND}"
Expand All @@ -130,7 +119,6 @@ cd build
cmake .. \
-DBUILD_TESTS=${BUILD_TESTS} \
-DBUILD_EXAMPLES=${BUILD_EXAMPLES} \
-DARROW_HOME=${ARROW_HOME} \
-DBUILD_JEMALLOC=${BUILD_JEMALLOC} \
-DBUILD_VELOX_BACKEND=${BUILD_VELOX_BACKEND} \
-DVELOX_HOME=${VELOX_HOME} \
Expand Down
46 changes: 15 additions & 31 deletions docs/get-started/GlutenUsage.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,37 +8,21 @@ parent: Getting-Started
## Parameters for buildbundle-veloxbe.sh or builddeps-veloxbe.sh
Please set them via `--`, e.g. `--build_type=Release`.

| Parameters | Description | Default value |
|------------------|-------------------------------------------------------------|---------------|
| build_type | build type for arrow, velox & gluten cpp, CMAKE_BUILD_TYPE | Release |
| build_tests | build test code in cpp folder and arrow | OFF |
| build_benchmarks | build benchmark code in cpp folder and arrow | OFF |
| build_jemalloc | build with jemalloc | ON |
| build_protobuf | build protobuf lib | ON |
| enable_qat | enable QAT for shuffle data de/compression | OFF |
| enable_iaa | enable IAA for shuffle data de/compression | OFF |
| enable_hbm | enable HBM allocator | OFF |
| enable_s3 | build with s3 lib | OFF |
| enable_hdfs | build with hdfs lib | OFF |
| enable_ep_cache | enable caching for external project build (Arrow and Velox) | OFF |
| skip_build_ep | skip the build of external projects (arrow, velox) | OFF |
| enable_vcpkg | enable vcpkg for static build | OFF |

## Parameters for get_arrow.sh
Please set them via `--`, e.g., `--enable_custom_codec=ON`.

| Parameters | Description | Default value |
| ---------- | ----------- | ------------- |
| enable_custom_codec | Apply patch to plugin custom codec (used by QAT/IAA) in Arrow cpp IPC module. | OFF |

## Parameters for build_arrow.sh
Please set them via `--`, e.g., `--arrow_home=/YOUR/PATH`.

| Parameters | Description | Default value |
| ---------- | ----------- | ------------- |
| arrow_home | Arrow build path | GLUTEN_DIR/ep/build-arrow/build|
| build_type | ARROW build type, CMAKE_BUILD_TYPE | Release|
| build_tests | Build arrow with -DARROW_JSON=ON | OFF |
| Parameters | Description | Default value |
|------------------|-----------------------------------------------------|---------------|
| build_type | build type for velox & gluten cpp, CMAKE_BUILD_TYPE | Release |
| build_tests | build test code in cpp folder | OFF |
| build_benchmarks | build benchmark code in cpp folder | OFF |
| build_jemalloc | build with jemalloc | ON |
| build_protobuf | build protobuf lib | ON |
| enable_qat | enable QAT for shuffle data de/compression | OFF |
| enable_iaa | enable IAA for shuffle data de/compression | OFF |
| enable_hbm | enable HBM allocator | OFF |
| enable_s3 | build with s3 lib | OFF |
| enable_hdfs | build with hdfs lib | OFF |
| enable_ep_cache | enable caching for external project build (Velox) | OFF |
| skip_build_ep | skip the build of external projects (velox) | OFF |
| enable_vcpkg | enable vcpkg for static build | OFF |

## Parameters for build_velox.sh
Please set them via `--`, e.g., `--velox_home=/YOUR/PATH`.
Expand Down
30 changes: 8 additions & 22 deletions docs/get-started/Velox.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ title: Gluten with Velox Backend
nav_order: 1
parent: Getting-Started
---
Currently, the mvn script can automatically fetch and build all dependency libraries incluing Velox and Arrow. Our nightly build still use Velox under oap-project.
Currently, the mvn script can automatically fetch and build all dependency libraries including Velox. Our nightly build still uses Velox under oap-project.

# Prerequisite

Expand All @@ -13,14 +13,7 @@ common OS and conda env deployment.

Gluten builds with Spark3.2.x and Spark3.3.x now but only fully tested in CI with 3.2.2 and 3.3.1. We will add/update supported/tested versions according to the upstream changes.

Velox uses the script `scripts/setup-xxx.sh` to install all dependency libraries, but Arrow's dependency libraries are not installed. Velox also requires ninja for compilation.
So we need to install all of them manually. Also, we need to set up the `JAVA_HOME` env. Currently, **java 8** is required and the support for java 11/17 is not ready.

```bash
## run as root
## install gcc and libraries to build arrow
apt-get update && apt-get install -y sudo locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-11-dev clang-11 libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev openjdk-8-jdk maven
```
We need to set up the `JAVA_HOME` env. Currently, **java 8** is required and the support for java 11/17 is not ready.

**For x86_64**

Expand Down Expand Up @@ -61,7 +54,7 @@ cd /path/to/gluten
./dev/buildbundle-veloxbe.sh

## After a complete build, if you need to re-build the project and only some gluten code is changed,
## you can use the following command to skip building arrow, velox and protobuf.
## you can use the following command to skip building velox and protobuf.
# ./dev/buildbundle-veloxbe.sh --skip_build_ep=ON --build_protobuf=OFF
```

Expand All @@ -75,30 +68,23 @@ cd /path/to/gluten
./dev/builddeps-veloxbe.sh
```

**Build Velox or Arrow separately**
**Build Velox separately**

Scripts under `/path/to/gluten/ep/build-xxx/src` provide `get_xxx.sh` and `build_xxx.sh` to build Velox or Arrow separately, you could use these scripts with custom repo/branch/location.
Scripts under `/path/to/gluten/ep/build-velox/src` provide `get_velox.sh` and `build_velox.sh` to build Velox separately. You can use these scripts with a custom repo/branch/location.

Velox can use pre-build arrow/parquet lib from ARROW_HOME parsed by --arrow_home to decrease build time.
Gluten cpp module need a required VELOX_HOME parsed by --velox_home and an optional ARROW_HOME by --arrow_home, if you specify custom ep location, make sure these variables be passed correctly.
Velox provides the arrow/parquet libs. The Gluten cpp module requires VELOX_HOME, passed via --velox_home; if you specify a custom ep location, make sure this variable is passed correctly.

```bash
## fetch Arrow and compile
cd /path/to/gluten/ep/build-arrow/src/
## you could use custom ep location by --arrow_home=custom_path, make sure specify --arrow_home in build_arrow.sh too.
./get_arrow.sh
./build_arrow.sh

## fetch Velox and compile
cd /path/to/gluten/ep/build-velox/src/
## you could use custom ep location by --velox_home=custom_path, make sure specify --velox_home in build_velox.sh too.
./get_velox.sh
## make sure specify --arrow_home or --velox_home if you have specified it in get_xxx.sh.
## make sure to specify --velox_home if you have specified it in get_velox.sh.
./build_velox.sh

## compile Gluten cpp module
cd /path/to/gluten/cpp
## if you use custom velox_home or arrow_home, make sure specified here by --arrow_home or --velox_home
## if you use a custom velox_home, make sure it is specified here via --velox_home
./compile.sh --build_velox_backend=ON

## compile Gluten java module and create package jar
Expand Down

0 comments on commit 29b5899

Please sign in to comment.