Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: Enable Parquet and Arrow by default #11832

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions .github/workflows/build-metrics.yml
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ jobs:
shell: bash
env:
VELOX_DEPENDENCY_SOURCE: SYSTEM
GTest_SOURCE: BUNDLED
simdjson_SOURCE: BUNDLED
xsimd_SOURCE: BUNDLED
steps:
Expand All @@ -67,8 +68,6 @@ jobs:
run: |
EXTRA_CMAKE_FLAGS=(
"-DVELOX_ENABLE_BENCHMARKS=ON"
"-DVELOX_ENABLE_ARROW=ON"
"-DVELOX_ENABLE_PARQUET=ON"
"-DVELOX_ENABLE_HDFS=ON"
"-DVELOX_ENABLE_S3=ON"
"-DVELOX_ENABLE_GCS=ON"
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/experimental.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
- name: "Build"
run: |
cd velox
make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}"
make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="${{ inputs.extraCMakeFlags }}"
ccache -s

- name: Upload aggregation fuzzer
Expand Down Expand Up @@ -146,7 +146,7 @@ jobs:
run: |
cd velox
source /opt/rh/gcc-toolset-12/enable
make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}"
make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="${{ inputs.extraCMakeFlags }}"
ccache -s

- name: "Run Aggregate Fuzzer"
Expand Down
3 changes: 0 additions & 3 deletions .github/workflows/linux-build-base.yml
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,6 @@ jobs:
run: |
EXTRA_CMAKE_FLAGS=(
"-DVELOX_ENABLE_BENCHMARKS=ON"
"-DVELOX_ENABLE_ARROW=ON"
"-DVELOX_ENABLE_PARQUET=ON"
"-DVELOX_ENABLE_HDFS=ON"
"-DVELOX_ENABLE_S3=ON"
"-DVELOX_ENABLE_GCS=ON"
Expand Down Expand Up @@ -160,7 +158,6 @@ jobs:
VELOX_DEPENDENCY_SOURCE: BUNDLED
ICU_SOURCE: SYSTEM
MAKEFLAGS: "NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=3"
EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_ENABLE_PARQUET=ON"
run: |
if [[ "${USE_CLANG}" = "true" ]]; then export CC=/usr/bin/clang-15; export CXX=/usr/bin/clang++-15; fi
make debug
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ jobs:
-GNinja \
-DTREAT_WARNINGS_AS_ERRORS=1 \
-DENABLE_ALL_WARNINGS=1 \
-DVELOX_ENABLE_PARQUET=ON \
-DVELOX_MONO_LIBRARY=ON \
-DVELOX_BUILD_SHARED=ON \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/scheduled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ jobs:

- name: Build
env:
EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_BUILD_PYTHON_PACKAGE=ON ${{ inputs.extraCMakeFlags }}"
EXTRA_CMAKE_FLAGS: "-DVELOX_BUILD_PYTHON_PACKAGE=ON ${{ inputs.extraCMakeFlags }}"
run: |
EXTRA_CMAKE_FLAGS="-DPYTHON_EXECUTABLE=$(which python3) $EXTRA_CMAKE_FLAGS"
make debug
Expand Down
145 changes: 72 additions & 73 deletions CMake/resolve_dependency_modules/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,84 +13,83 @@
# limitations under the License.
project(Arrow)

if(VELOX_ENABLE_ARROW)
find_package(Thrift)
if(Thrift_FOUND)
set(THRIFT_SOURCE "SYSTEM")
else()
set(THRIFT_SOURCE "BUNDLED")
endif()

set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep")
set(ARROW_CMAKE_ARGS
-DARROW_PARQUET=OFF
-DARROW_DEPENDENCY_SOURCE=AUTO
-DARROW_WITH_THRIFT=ON
-DARROW_WITH_LZ4=ON
-DARROW_WITH_SNAPPY=ON
-DARROW_WITH_ZLIB=ON
-DARROW_WITH_ZSTD=ON
-DARROW_JEMALLOC=OFF
-DARROW_SIMD_LEVEL=NONE
-DARROW_RUNTIME_SIMD_LEVEL=NONE
-DARROW_WITH_UTF8PROC=OFF
-DARROW_TESTING=ON
-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}/install
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DARROW_BUILD_STATIC=ON
-DThrift_SOURCE=${THRIFT_SOURCE}
-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH})
set(ARROW_LIBDIR ${ARROW_PREFIX}/install/${CMAKE_INSTALL_LIBDIR})
find_package(Thrift)
if(Thrift_FOUND)
set(THRIFT_SOURCE "SYSTEM")
else()
set(THRIFT_SOURCE "BUNDLED")
endif()

add_library(thrift STATIC IMPORTED GLOBAL)
if(NOT Thrift_FOUND)
set(THRIFT_ROOT ${ARROW_PREFIX}/src/arrow_ep-build/thrift_ep-install)
set(THRIFT_LIB ${THRIFT_ROOT}/lib/libthrift.a)
set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep")
set(ARROW_CMAKE_ARGS
-DARROW_PARQUET=OFF
-DARROW_DEPENDENCY_SOURCE=AUTO
-DARROW_WITH_THRIFT=ON
-DARROW_WITH_LZ4=ON
-DARROW_WITH_SNAPPY=ON
-DARROW_WITH_ZLIB=ON
-DARROW_WITH_ZSTD=ON
-DARROW_JEMALLOC=OFF
-DARROW_SIMD_LEVEL=NONE
-DARROW_RUNTIME_SIMD_LEVEL=NONE
-DARROW_WITH_UTF8PROC=OFF
-DARROW_TESTING=ON
-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}/install
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DARROW_BUILD_STATIC=ON
-DThrift_SOURCE=${THRIFT_SOURCE}
-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH})
set(ARROW_LIBDIR ${ARROW_PREFIX}/install/${CMAKE_INSTALL_LIBDIR})

file(MAKE_DIRECTORY ${THRIFT_ROOT}/include)
set(THRIFT_INCLUDE_DIR ${THRIFT_ROOT}/include)
endif()
add_library(thrift STATIC IMPORTED GLOBAL)
if(NOT Thrift_FOUND)
set(THRIFT_ROOT ${ARROW_PREFIX}/src/arrow_ep-build/thrift_ep-install)
set(THRIFT_LIB ${THRIFT_ROOT}/lib/libthrift.a)

set_property(TARGET thrift PROPERTY INTERFACE_INCLUDE_DIRECTORIES
${THRIFT_INCLUDE_DIR})
set_property(TARGET thrift PROPERTY IMPORTED_LOCATION ${THRIFT_LIB})
file(MAKE_DIRECTORY ${THRIFT_ROOT}/include)
set(THRIFT_INCLUDE_DIR ${THRIFT_ROOT}/include)
endif()

set(VELOX_ARROW_BUILD_VERSION 15.0.0)
set(VELOX_ARROW_BUILD_SHA256_CHECKSUM
01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d)
set(VELOX_ARROW_SOURCE_URL
"https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz"
)
set_property(TARGET thrift PROPERTY INTERFACE_INCLUDE_DIRECTORIES
${THRIFT_INCLUDE_DIR})
set_property(TARGET thrift PROPERTY IMPORTED_LOCATION ${THRIFT_LIB})

velox_resolve_dependency_url(ARROW)
set(VELOX_ARROW_BUILD_VERSION 15.0.0)
set(VELOX_ARROW_BUILD_SHA256_CHECKSUM
01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d)
set(VELOX_ARROW_SOURCE_URL
"https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz"
)

ExternalProject_Add(
arrow_ep
PREFIX ${ARROW_PREFIX}
URL ${VELOX_ARROW_SOURCE_URL}
URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM}
SOURCE_SUBDIR cpp
CMAKE_ARGS ${ARROW_CMAKE_ARGS}
BUILD_BYPRODUCTS ${ARROW_LIBDIR}/libarrow.a ${ARROW_LIBDIR}/libparquet.a
${ARROW_LIBDIR}/libarrow_testing.a ${THRIFT_LIB})
velox_resolve_dependency_url(ARROW)

add_library(arrow STATIC IMPORTED GLOBAL)
add_library(arrow_testing STATIC IMPORTED GLOBAL)
add_library(parquet STATIC IMPORTED GLOBAL)
add_dependencies(arrow arrow_ep)
add_dependencies(arrow_testing arrow)
add_dependencies(parquet arrow)
file(MAKE_DIRECTORY ${ARROW_PREFIX}/install/include)
set_target_properties(
arrow arrow_testing parquet PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
${ARROW_PREFIX}/install/include)
set_target_properties(arrow PROPERTIES IMPORTED_LOCATION
${ARROW_LIBDIR}/libarrow.a)
set_property(TARGET arrow PROPERTY INTERFACE_LINK_LIBRARIES ${RE2} thrift)
set_target_properties(
arrow_testing PROPERTIES IMPORTED_LOCATION
${ARROW_LIBDIR}/libarrow_testing.a)
set_target_properties(parquet PROPERTIES IMPORTED_LOCATION
${ARROW_LIBDIR}/libparquet.a)
ExternalProject_Add(
arrow_ep
PREFIX ${ARROW_PREFIX}
URL ${VELOX_ARROW_SOURCE_URL}
URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM}
SOURCE_SUBDIR cpp
CMAKE_ARGS ${ARROW_CMAKE_ARGS}
BUILD_BYPRODUCTS ${ARROW_LIBDIR}/libarrow.a ${ARROW_LIBDIR}/libparquet.a
${ARROW_LIBDIR}/libarrow_testing.a ${THRIFT_LIB})

endif()
add_library(arrow STATIC IMPORTED GLOBAL)
add_library(arrow_testing STATIC IMPORTED GLOBAL)
add_library(parquet STATIC IMPORTED GLOBAL)
add_dependencies(arrow arrow_ep)
add_dependencies(arrow_testing arrow)
add_dependencies(parquet arrow)
file(MAKE_DIRECTORY ${ARROW_PREFIX}/install/include)
set_target_properties(
arrow arrow_testing parquet
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${ARROW_PREFIX}/install/include)
set_target_properties(
arrow
PROPERTIES IMPORTED_LOCATION ${ARROW_LIBDIR}/libarrow.a)
set_property(TARGET arrow PROPERTY INTERFACE_LINK_LIBRARIES ${RE2} thrift)
set_target_properties(
arrow_testing
PROPERTIES IMPORTED_LOCATION ${ARROW_LIBDIR}/libarrow_testing.a)
set_target_properties(
parquet
PROPERTIES IMPORTED_LOCATION ${ARROW_LIBDIR}/libparquet.a)
19 changes: 4 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,6 @@ option(VELOX_ENABLE_S3 "Build S3 Connector" OFF)
option(VELOX_ENABLE_GCS "Build GCS Connector" OFF)
option(VELOX_ENABLE_ABFS "Build Abfs Connector" OFF)
option(VELOX_ENABLE_HDFS "Build Hdfs Connector" OFF)
option(VELOX_ENABLE_PARQUET "Enable Parquet support" OFF)
option(VELOX_ENABLE_ARROW "Enable Arrow support" OFF)
option(VELOX_ENABLE_REMOTE_FUNCTIONS "Enable remote function support" OFF)
option(VELOX_ENABLE_CCACHE "Use ccache if installed." ON)

Expand Down Expand Up @@ -183,7 +181,6 @@ if(${VELOX_BUILD_TESTING})
set(VELOX_ENABLE_TPCH_CONNECTOR ON)
set(VELOX_ENABLE_SPARK_FUNCTIONS ON)
set(VELOX_ENABLE_EXAMPLES ON)
set(VELOX_ENABLE_PARQUET ON)
endif()

if(${VELOX_ENABLE_BENCHMARKS})
Expand Down Expand Up @@ -280,14 +277,6 @@ endif()
if(VELOX_ENABLE_HDFS)
add_definitions(-DVELOX_ENABLE_HDFS)
# JVM libhdfs requires arrow dependency.
set(VELOX_ENABLE_ARROW ON)
endif()

if(VELOX_ENABLE_PARQUET)
add_definitions(-DVELOX_ENABLE_PARQUET)
# Native Parquet reader requires Apache Thrift and Arrow Parquet writer, which
# are included in Arrow.
set(VELOX_ENABLE_ARROW ON)
endif()

# make buildPartitionBounds_ a vector int64 instead of int32 to avoid integer
Expand Down Expand Up @@ -633,9 +622,9 @@ if("${TREAT_WARNINGS_AS_ERRORS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
endif()

if(VELOX_ENABLE_ARROW)
velox_set_source(Arrow)
velox_resolve_dependency(Arrow)
endif()
# Native Parquet reader requires Apache Thrift and Arrow Parquet writer, which
# are included in Arrow.
velox_set_source(Arrow)
velox_resolve_dependency(Arrow)

add_subdirectory(velox)
3 changes: 1 addition & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ services:
environment:
NUM_THREADS: 8 # default value for NUM_THREADS
CCACHE_DIR: "/velox/.ccache"
EXTRA_CMAKE_FLAGS: -DVELOX_ENABLE_PARQUET=ON
-DVELOX_ENABLE_S3=ON
EXTRA_CMAKE_FLAGS: -DVELOX_ENABLE_S3=ON
volumes:
- .:/velox:delegated
working_dir: /velox
Expand Down
8 changes: 0 additions & 8 deletions velox/connectors/hive/HiveConnectorUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,7 @@
#include "velox/dwio/common/Reader.h"
#include "velox/dwio/dwrf/common/Config.h"
#include "velox/dwio/dwrf/writer/Writer.h"

#ifdef VELOX_ENABLE_PARQUET
#include "velox/dwio/parquet/writer/Writer.h" // @manual
#endif

#include "velox/expression/Expr.h"
#include "velox/expression/ExprToSubfieldFilter.h"
#include "velox/type/TimestampConversion.h"
Expand Down Expand Up @@ -905,7 +901,6 @@ core::TypedExprPtr extractFiltersFromRemainingFilter(

namespace {

#ifdef VELOX_ENABLE_PARQUET
std::optional<TimestampUnit> getTimestampUnit(
const config::ConfigBase& config,
const char* configKey) {
Expand Down Expand Up @@ -966,7 +961,6 @@ void updateParquetWriterOptions(

writerOptions = std::move(parquetWriterOptions);
}
#endif

void updateDWRFWriterOptions(
const std::shared_ptr<const HiveConfig>& hiveConfig,
Expand Down Expand Up @@ -1038,9 +1032,7 @@ void updateWriterOptionsFromHiveConfig(
updateDWRFWriterOptions(hiveConfig, sessionProperties, writerOptions);
break;
case dwio::common::FileFormat::PARQUET:
#ifdef VELOX_ENABLE_PARQUET
updateParquetWriterOptions(hiveConfig, sessionProperties, writerOptions);
#endif
break;
case dwio::common::FileFormat::NIMBLE:
// No-op for now.
Expand Down
18 changes: 6 additions & 12 deletions velox/connectors/hive/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,17 @@ add_executable(
TableHandleTest.cpp)
add_test(velox_hive_connector_test velox_hive_connector_test)

target_include_directories(velox_hive_connector_test
PUBLIC ${ARROW_PREFIX}/install/include)
target_link_libraries(
velox_hive_connector_test
velox_dwio_common_exception
velox_dwio_native_parquet_reader
velox_exec
velox_exec_test_lib
velox_hive_connector
velox_hive_partition_function
velox_dwio_common_exception
velox_vector_fuzzer
velox_vector_test_lib
velox_exec
velox_exec_test_lib
GTest::gtest
GTest::gtest_main)

if(VELOX_ENABLE_PARQUET)

target_include_directories(velox_hive_connector_test
PUBLIC ${ARROW_PREFIX}/install/include)
target_link_libraries(velox_hive_connector_test
velox_dwio_native_parquet_reader)

endif()
8 changes: 1 addition & 7 deletions velox/connectors/hive/tests/HiveConnectorUtilTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,9 @@
#include "velox/connectors/hive/HiveConfig.h"
#include "velox/connectors/hive/HiveConnectorSplit.h"
#include "velox/connectors/hive/TableHandle.h"
#include "velox/exec/tests/utils/HiveConnectorTestBase.h"

#include "velox/dwio/dwrf/writer/Writer.h"

#ifdef VELOX_ENABLE_PARQUET
#include "velox/dwio/parquet/writer/Writer.h"
#endif
#include "velox/exec/tests/utils/HiveConnectorTestBase.h"

namespace facebook::velox::connector {

Expand Down Expand Up @@ -403,7 +399,6 @@ TEST_F(
"2");
}

#ifdef VELOX_ENABLE_PARQUET
TEST_F(HiveConnectorUtilTest, updateWriterOptionsFromHiveConfigParquet) {
auto fileFormat = dwio::common::FileFormat::PARQUET;
std::unordered_map<std::string, std::string> connectorConfig = {
Expand All @@ -427,6 +422,5 @@ TEST_F(HiveConnectorUtilTest, updateWriterOptionsFromHiveConfigParquet) {
parquetOptions->parquetWriteTimestampUnit.value(), TimestampUnit::kMilli);
ASSERT_EQ(parquetOptions->parquetWriteTimestampTimeZone.value(), "UTC");
}
#endif

} // namespace facebook::velox::connector
Loading
Loading