From 9140a65719f13cf74127e045565909a6eeb8bf43 Mon Sep 17 00:00:00 2001 From: Surbhi Vijayvargeeya Date: Tue, 31 Oct 2023 13:33:49 +0530 Subject: [PATCH 1/7] Updated documentation --- cpp/compile.sh | 6 +++++- docs/get-started/Velox.md | 13 +++---------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/cpp/compile.sh b/cpp/compile.sh index de6b0f36efd5..62c32830c7fb 100755 --- a/cpp/compile.sh +++ b/cpp/compile.sh @@ -99,7 +99,11 @@ CURRENT_DIR=$( #gluten cpp will find arrow/parquet lib from ARROW_HOME if [ "$ARROW_HOME" == "" ]; then - ARROW_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep/_build/release/third_party/arrow_ep" + if ["$BUILD_TYPE" == "Debug"] then + ARROW_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep/_build/debug/third_party/arrow_ep" + else + ARROW_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep/_build/release/third_party/arrow_ep" + fi fi #gluten cpp will find velox lib from VELOX_HOME diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md index 8a30df2fdc37..8a032add2814 100644 --- a/docs/get-started/Velox.md +++ b/docs/get-started/Velox.md @@ -75,20 +75,13 @@ cd /path/to/gluten ./dev/builddeps-veloxbe.sh ``` -**Build Velox or Arrow separately** +**Build Velox separately** -Scripts under `/path/to/gluten/ep/build-xxx/src` provide `get_xxx.sh` and `build_xxx.sh` to build Velox or Arrow separately, you could use these scripts with custom repo/branch/location. +Scripts under `/path/to/gluten/ep/build-xxx/src` provide `get_xxx.sh` and `build_xxx.sh` to build Velox separately, you could use these scripts with custom repo/branch/location. -Velox can use pre-build arrow/parquet lib from ARROW_HOME parsed by --arrow_home to decrease build time. -Gluten cpp module need a required VELOX_HOME parsed by --velox_home and an optional ARROW_HOME by --arrow_home, if you specify custom ep location, make sure these variables be passed correctly. +Velox provides arrow/parquet lib. 
Gluten cpp module need a required VELOX_HOME parsed by --velox_home, if you specify custom ep location, make sure these variables be passed correctly. ```bash -## fetch Arrow and compile -cd /path/to/gluten/ep/build-arrow/src/ -## you could use custom ep location by --arrow_home=custom_path, make sure specify --arrow_home in build_arrow.sh too. -./get_arrow.sh -./build_arrow.sh - ## fetch Velox and compile cd /path/to/gluten/ep/build-velox/src/ ## you could use custom ep location by --velox_home=custom_path, make sure specify --velox_home in build_velox.sh too. From a7275486a840a8d1bbf5e4e66483266143874f07 Mon Sep 17 00:00:00 2001 From: Surbhi Vijayvargeeya Date: Tue, 31 Oct 2023 14:05:47 +0530 Subject: [PATCH 2/7] Correct formatting --- cpp/compile.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/compile.sh b/cpp/compile.sh index 62c32830c7fb..e3733bdf755a 100755 --- a/cpp/compile.sh +++ b/cpp/compile.sh @@ -99,7 +99,7 @@ CURRENT_DIR=$( #gluten cpp will find arrow/parquet lib from ARROW_HOME if [ "$ARROW_HOME" == "" ]; then - if ["$BUILD_TYPE" == "Debug"] then + if [ "$BUILD_TYPE" == "Debug" ]; then ARROW_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep/_build/debug/third_party/arrow_ep" else ARROW_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep/_build/release/third_party/arrow_ep" From 0184c52863f57cebe20a064e21e7acc80186de1c Mon Sep 17 00:00:00 2001 From: Surbhi Vijayvargeeya Date: Thu, 2 Nov 2023 16:09:46 +0530 Subject: [PATCH 3/7] Removed --arrow_home --- .github/workflows/velox_be.yml | 19 +++++++------- cpp/compile.sh | 16 ------------ docs/get-started/GlutenUsage.md | 46 +++++++++++---------------------- docs/get-started/Velox.md | 8 +++--- 4 files changed, 28 insertions(+), 61 deletions(-) diff --git a/.github/workflows/velox_be.yml b/.github/workflows/velox_be.yml index bd16547f5163..70f2131e2aea 100644 --- a/.github/workflows/velox_be.yml +++ b/.github/workflows/velox_be.yml @@ -28,7 +28,6 @@ on: - 'shims/**' - 
'tools/gluten-it/**' - 'tools/gluten-te/**' - - 'ep/build-arrow/**' - 'ep/build-velox/**' - 'cpp/*' - 'cpp/CMake/**' @@ -62,7 +61,7 @@ jobs: run: | docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c ' cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --build_tests=ON --build_examples=ON --build_benchmarks=ON' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_tests=ON --build_examples=ON --build_benchmarks=ON' - name: Run CPP unit test run: | docker exec ubuntu2004-test-$GITHUB_RUN_ID bash -c 'cd /opt/gluten/cpp/build && \ @@ -108,7 +107,7 @@ jobs: run: | docker exec ubuntu2004-test-slow-$GITHUB_RUN_ID bash -c ' cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox' - name: Build and run unit test for Spark 3.2.2(slow tests) run: | docker exec ubuntu2004-test-slow-$GITHUB_RUN_ID bash -c ' @@ -146,7 +145,7 @@ jobs: run: | docker exec ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID bash -l -c ' cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox' - name: Build and Run unit test for Spark 3.3.1(slow tests) run: | docker exec ubuntu2004-test-spark33-slow-$GITHUB_RUN_ID bash -l -c 'cd /opt/gluten && \ @@ -183,7 +182,7 @@ jobs: run: | docker exec ubuntu2004-test-spark33-$GITHUB_RUN_ID bash -c ' cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --build_examples=ON' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_examples=ON' - name: Build and Run unit test for Spark 3.3.1(other tests) run: | docker exec 
ubuntu2004-test-spark33-$GITHUB_RUN_ID bash -c 'cd /opt/gluten && \ @@ -213,7 +212,7 @@ jobs: run: | docker exec ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID bash -l -c ' cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox ' - name: Build and Run unit test for Spark 3.4.1(slow tests) run: | docker exec ubuntu2004-test-spark34-slow-$GITHUB_RUN_ID bash -l -c 'cd /opt/gluten && \ @@ -250,7 +249,7 @@ jobs: run: | docker exec ubuntu2004-test-spark34-$GITHUB_RUN_ID bash -c ' cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --build_examples=ON' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --build_examples=ON' - name: Build and Run unit test for Spark 3.4.1(other tests) run: | docker exec ubuntu2004-test-spark34-$GITHUB_RUN_ID bash -c 'cd /opt/gluten && \ @@ -280,7 +279,7 @@ jobs: run: | docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c ' cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --enable_hdfs=ON --enable_s3=ON' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON' - name: Build for Spark 3.2.2 run: | docker exec ubuntu2204-test-$GITHUB_RUN_ID bash -c ' @@ -361,7 +360,7 @@ jobs: docker exec centos8-test-$GITHUB_RUN_ID bash -c ' source /env.sh && \ cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep --enable_hdfs=ON --enable_s3=ON' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --enable_hdfs=ON --enable_s3=ON' - name: Build for Spark 3.2.2 run: | docker exec centos8-test-$GITHUB_RUN_ID bash -c ' @@ -410,7 +409,7 @@ jobs: docker exec 
centos7-test-$GITHUB_RUN_ID bash -c ' source /env.sh && \ cd /opt/gluten/cpp && \ - ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox --arrow_home=/opt/velox/_build/release/third_party/arrow_ep' + ./compile.sh --build_velox_backend=ON --velox_home=/opt/velox' - name: Build for Spark 3.2.2 run: | docker exec centos7-test-$GITHUB_RUN_ID bash -c ' diff --git a/cpp/compile.sh b/cpp/compile.sh index e3733bdf755a..4f2ec01150d5 100755 --- a/cpp/compile.sh +++ b/cpp/compile.sh @@ -27,16 +27,11 @@ ENABLE_QAT=OFF ENABLE_HBM=OFF ENABLE_S3=OFF ENABLE_HDFS=OFF -ARROW_HOME= VELOX_HOME= NPROC=$(nproc --ignore=2) for arg in "$@"; do case $arg in - --arrow_home=*) - ARROW_HOME=("${arg#*=}") - shift # Remove argument name from processing - ;; --velox_home=*) VELOX_HOME=("${arg#*=}") shift # Remove argument name from processing @@ -97,15 +92,6 @@ CURRENT_DIR=$( pwd ) -#gluten cpp will find arrow/parquet lib from ARROW_HOME -if [ "$ARROW_HOME" == "" ]; then - if [ "$BUILD_TYPE" == "Debug" ]; then - ARROW_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep/_build/debug/third_party/arrow_ep" - else - ARROW_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep/_build/release/third_party/arrow_ep" - fi -fi - #gluten cpp will find velox lib from VELOX_HOME if [ "$VELOX_HOME" == "" ]; then VELOX_HOME="$CURRENT_DIR/../ep/build-velox/build/velox_ep" @@ -113,7 +99,6 @@ fi echo "Building gluten cpp part..." echo "CMAKE Arguments:" -echo "ARROW_HOME=${ARROW_HOME}" echo "VELOX_HOME=${VELOX_HOME}" echo "BUILD_TYPE=${BUILD_TYPE}" echo "BUILD_VELOX_BACKEND=${BUILD_VELOX_BACKEND}" @@ -134,7 +119,6 @@ cd build cmake .. 
\ -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_EXAMPLES=${BUILD_EXAMPLES} \ - -DARROW_HOME=${ARROW_HOME} \ -DBUILD_JEMALLOC=${BUILD_JEMALLOC} \ -DBUILD_VELOX_BACKEND=${BUILD_VELOX_BACKEND} \ -DVELOX_HOME=${VELOX_HOME} \ diff --git a/docs/get-started/GlutenUsage.md b/docs/get-started/GlutenUsage.md index 46247c6e9c3b..2b78d2b84530 100644 --- a/docs/get-started/GlutenUsage.md +++ b/docs/get-started/GlutenUsage.md @@ -8,37 +8,21 @@ parent: Getting-Started ## Parameters for buildbundle-veloxbe.sh or builddeps-veloxbe.sh Please set them via `--`, e.g. `--build_type=Release`. -| Parameters | Description | Default value | -|------------------|-------------------------------------------------------------|---------------| -| build_type | build type for arrow, velox & gluten cpp, CMAKE_BUILD_TYPE | Release | -| build_tests | build test code in cpp folder and arrow | OFF | -| build_benchmarks | build benchmark code in cpp folder and arrow | OFF | -| build_jemalloc | build with jemalloc | ON | -| build_protobuf | build protobuf lib | ON | -| enable_qat | enable QAT for shuffle data de/compression | OFF | -| enable_iaa | enable IAA for shuffle data de/compression | OFF | -| enable_hbm | enable HBM allocator | OFF | -| enable_s3 | build with s3 lib | OFF | -| enable_hdfs | build with hdfs lib | OFF | -| enable_ep_cache | enable caching for external project build (Arrow and Velox) | OFF | -| skip_build_ep | skip the build of external projects (arrow, velox) | OFF | -| enable_vcpkg | enable vcpkg for static build | OFF | - -## Parameters for get_arrow.sh -Please set them via `--`, e.g., `--enable_custom_codec=ON`. - -| Parameters | Description | Default value | -| ---------- | ----------- | ------------- | -| enable_custom_codec | Apply patch to plugin custom codec (used by QAT/IAA) in Arrow cpp IPC module. | OFF | - -## Parameters for build_arrow.sh -Please set them via `--`, e.g., `--arrow_home=/YOUR/PATH`. 
- | Parameters | Description | Default value | -| ---------- | ----------- | ------------- | -| arrow_home | Arrow build path | GLUTEN_DIR/ep/build-arrow/build| -| build_type | ARROW build type, CMAKE_BUILD_TYPE | Release| -| build_tests | Build arrow with -DARROW_JSON=ON | OFF | +| Parameters | Description | Default value | +|------------------|-----------------------------------------------------|---------------| +| build_type | build type for velox & gluten cpp, CMAKE_BUILD_TYPE | Release | +| build_tests | build test code in cpp folder | OFF | +| build_benchmarks | build benchmark code in cpp folder | OFF | +| build_jemalloc | build with jemalloc | ON | +| build_protobuf | build protobuf lib | ON | +| enable_qat | enable QAT for shuffle data de/compression | OFF | +| enable_iaa | enable IAA for shuffle data de/compression | OFF | +| enable_hbm | enable HBM allocator | OFF | +| enable_s3 | build with s3 lib | OFF | +| enable_hdfs | build with hdfs lib | OFF | +| enable_ep_cache | enable caching for external project build (Velox) | OFF | +| skip_build_ep | skip the build of external projects (velox) | OFF | +| enable_vcpkg | enable vcpkg for static build | OFF | ## Parameters for build_velox.sh Please set them via `--`, e.g., `--velox_home=/YOUR/PATH`. diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md index 8a032add2814..431d858df80b 100644 --- a/docs/get-started/Velox.md +++ b/docs/get-started/Velox.md @@ -4,7 +4,7 @@ title: Gluten with Velox Backend nav_order: 1 parent: Getting-Started --- -Currently, the mvn script can automatically fetch and build all dependency libraries incluing Velox and Arrow. Our nightly build still use Velox under oap-project. +Currently, the mvn script can automatically fetch and build all dependency libraries including Velox. Our nightly build still uses Velox under oap-project. 
# Prerequisite @@ -61,7 +61,7 @@ cd /path/to/gluten ./dev/buildbundle-veloxbe.sh ## After a complete build, if you need to re-build the project and only some gluten code is changed, -## you can use the following command to skip building arrow, velox and protobuf. +## you can use the following command to skip building velox and protobuf. # ./dev/buildbundle-veloxbe.sh --skip_build_ep=ON --build_protobuf=OFF ``` @@ -86,12 +86,12 @@ Velox provides arrow/parquet lib. Gluten cpp module need a required VELOX_HOME p cd /path/to/gluten/ep/build-velox/src/ ## you could use custom ep location by --velox_home=custom_path, make sure specify --velox_home in build_velox.sh too. ./get_velox.sh -## make sure specify --arrow_home or --velox_home if you have specified it in get_xxx.sh. +## make sure specify --velox_home if you have specified it in get_xxx.sh. ./build_velox.sh ## compile Gluten cpp module cd /path/to/gluten/cpp -## if you use custom velox_home or arrow_home, make sure specified here by --arrow_home or --velox_home +## if you use custom velox_home, make sure specified here by --velox_home ./compile.sh --build_velox_backend=ON ## compile Gluten java module and create package jar From f198f1e4dda78e0de6172a93847b120587ad3fe2 Mon Sep 17 00:00:00 2001 From: Surbhi Vijayvargeeya Date: Thu, 2 Nov 2023 20:04:29 +0530 Subject: [PATCH 4/7] Use VELOX_HOME instead of GLUTEN_HOME while setting ARROW_HOME --- cpp/CMakeLists.txt | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 99ca3867de9a..88805c30eeed 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -54,12 +54,10 @@ option(ENABLE_ORC "Enable ORC" OFF) set(root_directory ${PROJECT_BINARY_DIR}) get_filename_component(GLUTEN_HOME ${CMAKE_SOURCE_DIR} DIRECTORY) -if (NOT DEFINED ARROW_HOME) - if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") - set(ARROW_HOME ${GLUTEN_HOME}/ep/build-velox/build/velox_ep/_build/debug/third_party/arrow_ep) - else() - 
set(ARROW_HOME ${GLUTEN_HOME}/ep/build-velox/build/velox_ep/_build/release/third_party/arrow_ep) - endif() +if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") + set(ARROW_HOME ${VELOX_HOME}/_build/debug/third_party/arrow_ep) +else() + set(ARROW_HOME ${VELOX_HOME}/_build/release/third_party/arrow_ep) endif() # From 1aa31f03e0daea2e92fc728a7c57bed2be6c978c Mon Sep 17 00:00:00 2001 From: Surbhi Vijayvargeeya Date: Thu, 2 Nov 2023 20:15:28 +0530 Subject: [PATCH 5/7] Removed section to manually install arrow dependencies --- docs/get-started/Velox.md | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md index 431d858df80b..0680a5a4c985 100644 --- a/docs/get-started/Velox.md +++ b/docs/get-started/Velox.md @@ -13,14 +13,7 @@ common OS and conda env deployment. Gluten builds with Spark3.2.x and Spark3.3.x now but only fully tested in CI with 3.2.2 and 3.3.1. We will add/update supported/tested versions according to the upstream changes. -Velox uses the script `scripts/setup-xxx.sh` to install all dependency libraries, but Arrow's dependency libraries are not installed. Velox also requires ninja for compilation. -So we need to install all of them manually. Also, we need to set up the `JAVA_HOME` env. Currently, **java 8** is required and the support for java 11/17 is not ready. - -```bash -## run as root -## install gcc and libraries to build arrow -apt-get update && apt-get install -y sudo locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-11-dev clang-11 libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev openjdk-8-jdk maven -``` +We need to set up the `JAVA_HOME` environment variable. Currently, **java 8** is required and the support for java 11/17 is not ready. 
**For x86_64** From 58c0068fee23e3fe433867ad17b223f2619d5294 Mon Sep 17 00:00:00 2001 From: Surbhi Vijayvargeeya Date: Mon, 6 Nov 2023 11:47:16 +0530 Subject: [PATCH 6/7] Updated doc as per comment --- docs/get-started/Velox.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/get-started/Velox.md b/docs/get-started/Velox.md index 0680a5a4c985..00dde26c9b07 100644 --- a/docs/get-started/Velox.md +++ b/docs/get-started/Velox.md @@ -70,7 +70,7 @@ cd /path/to/gluten **Build Velox separately** -Scripts under `/path/to/gluten/ep/build-xxx/src` provide `get_xxx.sh` and `build_xxx.sh` to build Velox separately, you could use these scripts with custom repo/branch/location. +Scripts under `/path/to/gluten/ep/build-velox/src` provide `get_velox.sh` and `build_velox.sh` to build Velox separately, you could use these scripts with custom repo/branch/location. Velox provides arrow/parquet lib. Gluten cpp module need a required VELOX_HOME parsed by --velox_home, if you specify custom ep location, make sure these variables be passed correctly. @@ -79,7 +79,7 @@ Velox provides arrow/parquet lib. Gluten cpp module need a required VELOX_HOME p cd /path/to/gluten/ep/build-velox/src/ ## you could use custom ep location by --velox_home=custom_path, make sure specify --velox_home in build_velox.sh too. ./get_velox.sh -## make sure specify --velox_home if you have specified it in get_xxx.sh. +## make sure specify --velox_home if you have specified it in get_velox.sh. 
./build_velox.sh ## compile Gluten cpp module From 3db954403ebd32f8df156cc99f6c5e2166b71844 Mon Sep 17 00:00:00 2001 From: Surbhi Vijayvargeeya Date: Tue, 7 Nov 2023 20:38:50 +0530 Subject: [PATCH 7/7] fix build error --- cpp/CMakeLists.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 88805c30eeed..3cacdb0f4536 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -54,6 +54,12 @@ option(ENABLE_ORC "Enable ORC" OFF) set(root_directory ${PROJECT_BINARY_DIR}) get_filename_component(GLUTEN_HOME ${CMAKE_SOURCE_DIR} DIRECTORY) +if (NOT DEFINED VELOX_HOME) + set(VELOX_HOME ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) + message(STATUS "Set VELOX_HOME to ${VELOX_HOME}") +endif() + -if (${CMAKE_BUILD_TYPE} STREQUAL "Debug") +# Quote the expansion so an empty or unset CMAKE_BUILD_TYPE does not make if() fail to parse. +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") set(ARROW_HOME ${VELOX_HOME}/_build/debug/third_party/arrow_ep) else()