Skip to content

Commit

Permalink
refactor: Enable Parquet and Arrow by default
Browse files (browse the repository at this point in the history)
  • Loading branch information
zuyu committed Dec 12, 2024
1 parent a775a6c commit b342ac0
Show file tree
Hide file tree
Showing 18 changed files with 131 additions and 200 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/experimental.yml
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ jobs:
- name: "Build"
run: |
cd velox
make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}"
make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="${{ inputs.extraCMakeFlags }}"
ccache -s
- name: Upload aggregation fuzzer
Expand Down Expand Up @@ -146,7 +146,7 @@ jobs:
run: |
cd velox
source /opt/rh/gcc-toolset-12/enable
make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="-DVELOX_ENABLE_ARROW=ON ${{ inputs.extraCMakeFlags }}"
make debug NUM_THREADS="${{ inputs.numThreads || 8 }}" MAX_HIGH_MEM_JOBS="${{ inputs.maxHighMemJobs || 8 }}" MAX_LINK_JOBS="${{ inputs.maxLinkJobs || 4 }}" EXTRA_CMAKE_FLAGS="${{ inputs.extraCMakeFlags }}"
ccache -s
- name: "Run Aggregate Fuzzer"
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/macos.yml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ jobs:
-GNinja \
-DTREAT_WARNINGS_AS_ERRORS=1 \
-DENABLE_ALL_WARNINGS=1 \
-DVELOX_ENABLE_PARQUET=ON \
-DVELOX_MONO_LIBRARY=ON \
-DVELOX_BUILD_SHARED=ON \
-DCMAKE_BUILD_TYPE=$BUILD_TYPE
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/scheduled.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ jobs:
- name: Build
env:
EXTRA_CMAKE_FLAGS: "-DVELOX_ENABLE_ARROW=ON -DVELOX_BUILD_PYTHON_PACKAGE=ON ${{ inputs.extraCMakeFlags }}"
EXTRA_CMAKE_FLAGS: "-DVELOX_BUILD_PYTHON_PACKAGE=ON ${{ inputs.extraCMakeFlags }}"
run: |
EXTRA_CMAKE_FLAGS="-DPYTHON_EXECUTABLE=$(which python3) $EXTRA_CMAKE_FLAGS"
make debug
Expand Down
145 changes: 72 additions & 73 deletions CMake/resolve_dependency_modules/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,84 +13,83 @@
# limitations under the License.
project(Arrow)

if(VELOX_ENABLE_ARROW)
find_package(Thrift)
if(Thrift_FOUND)
set(THRIFT_SOURCE "SYSTEM")
else()
set(THRIFT_SOURCE "BUNDLED")
endif()

set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep")
set(ARROW_CMAKE_ARGS
-DARROW_PARQUET=OFF
-DARROW_DEPENDENCY_SOURCE=AUTO
-DARROW_WITH_THRIFT=ON
-DARROW_WITH_LZ4=ON
-DARROW_WITH_SNAPPY=ON
-DARROW_WITH_ZLIB=ON
-DARROW_WITH_ZSTD=ON
-DARROW_JEMALLOC=OFF
-DARROW_SIMD_LEVEL=NONE
-DARROW_RUNTIME_SIMD_LEVEL=NONE
-DARROW_WITH_UTF8PROC=OFF
-DARROW_TESTING=ON
-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}/install
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DARROW_BUILD_STATIC=ON
-DThrift_SOURCE=${THRIFT_SOURCE}
-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH})
set(ARROW_LIBDIR ${ARROW_PREFIX}/install/${CMAKE_INSTALL_LIBDIR})
find_package(Thrift)
if(Thrift_FOUND)
set(THRIFT_SOURCE "SYSTEM")
else()
set(THRIFT_SOURCE "BUNDLED")
endif()

add_library(thrift STATIC IMPORTED GLOBAL)
if(NOT Thrift_FOUND)
set(THRIFT_ROOT ${ARROW_PREFIX}/src/arrow_ep-build/thrift_ep-install)
set(THRIFT_LIB ${THRIFT_ROOT}/lib/libthrift.a)
set(ARROW_PREFIX "${CMAKE_CURRENT_BINARY_DIR}/arrow_ep")
set(ARROW_CMAKE_ARGS
-DARROW_PARQUET=OFF
-DARROW_DEPENDENCY_SOURCE=AUTO
-DARROW_WITH_THRIFT=ON
-DARROW_WITH_LZ4=ON
-DARROW_WITH_SNAPPY=ON
-DARROW_WITH_ZLIB=ON
-DARROW_WITH_ZSTD=ON
-DARROW_JEMALLOC=OFF
-DARROW_SIMD_LEVEL=NONE
-DARROW_RUNTIME_SIMD_LEVEL=NONE
-DARROW_WITH_UTF8PROC=OFF
-DARROW_TESTING=ON
-DCMAKE_INSTALL_PREFIX=${ARROW_PREFIX}/install
-DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}
-DARROW_BUILD_STATIC=ON
-DThrift_SOURCE=${THRIFT_SOURCE}
-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH})
set(ARROW_LIBDIR ${ARROW_PREFIX}/install/${CMAKE_INSTALL_LIBDIR})

file(MAKE_DIRECTORY ${THRIFT_ROOT}/include)
set(THRIFT_INCLUDE_DIR ${THRIFT_ROOT}/include)
endif()
add_library(thrift STATIC IMPORTED GLOBAL)
if(NOT Thrift_FOUND)
set(THRIFT_ROOT ${ARROW_PREFIX}/src/arrow_ep-build/thrift_ep-install)
set(THRIFT_LIB ${THRIFT_ROOT}/lib/libthrift.a)

set_property(TARGET thrift PROPERTY INTERFACE_INCLUDE_DIRECTORIES
${THRIFT_INCLUDE_DIR})
set_property(TARGET thrift PROPERTY IMPORTED_LOCATION ${THRIFT_LIB})
file(MAKE_DIRECTORY ${THRIFT_ROOT}/include)
set(THRIFT_INCLUDE_DIR ${THRIFT_ROOT}/include)
endif()

set(VELOX_ARROW_BUILD_VERSION 15.0.0)
set(VELOX_ARROW_BUILD_SHA256_CHECKSUM
01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d)
set(VELOX_ARROW_SOURCE_URL
"https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz"
)
set_property(TARGET thrift PROPERTY INTERFACE_INCLUDE_DIRECTORIES
${THRIFT_INCLUDE_DIR})
set_property(TARGET thrift PROPERTY IMPORTED_LOCATION ${THRIFT_LIB})

velox_resolve_dependency_url(ARROW)
set(VELOX_ARROW_BUILD_VERSION 15.0.0)
set(VELOX_ARROW_BUILD_SHA256_CHECKSUM
01dd3f70e85d9b5b933ec92c0db8a4ef504a5105f78d2d8622e84279fb45c25d)
set(VELOX_ARROW_SOURCE_URL
"https://archive.apache.org/dist/arrow/arrow-${VELOX_ARROW_BUILD_VERSION}/apache-arrow-${VELOX_ARROW_BUILD_VERSION}.tar.gz"
)

ExternalProject_Add(
arrow_ep
PREFIX ${ARROW_PREFIX}
URL ${VELOX_ARROW_SOURCE_URL}
URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM}
SOURCE_SUBDIR cpp
CMAKE_ARGS ${ARROW_CMAKE_ARGS}
BUILD_BYPRODUCTS ${ARROW_LIBDIR}/libarrow.a ${ARROW_LIBDIR}/libparquet.a
${ARROW_LIBDIR}/libarrow_testing.a ${THRIFT_LIB})
velox_resolve_dependency_url(ARROW)

add_library(arrow STATIC IMPORTED GLOBAL)
add_library(arrow_testing STATIC IMPORTED GLOBAL)
add_library(parquet STATIC IMPORTED GLOBAL)
add_dependencies(arrow arrow_ep)
add_dependencies(arrow_testing arrow)
add_dependencies(parquet arrow)
file(MAKE_DIRECTORY ${ARROW_PREFIX}/install/include)
set_target_properties(
arrow arrow_testing parquet PROPERTIES INTERFACE_INCLUDE_DIRECTORIES
${ARROW_PREFIX}/install/include)
set_target_properties(arrow PROPERTIES IMPORTED_LOCATION
${ARROW_LIBDIR}/libarrow.a)
set_property(TARGET arrow PROPERTY INTERFACE_LINK_LIBRARIES ${RE2} thrift)
set_target_properties(
arrow_testing PROPERTIES IMPORTED_LOCATION
${ARROW_LIBDIR}/libarrow_testing.a)
set_target_properties(parquet PROPERTIES IMPORTED_LOCATION
${ARROW_LIBDIR}/libparquet.a)
ExternalProject_Add(
arrow_ep
PREFIX ${ARROW_PREFIX}
URL ${VELOX_ARROW_SOURCE_URL}
URL_HASH ${VELOX_ARROW_BUILD_SHA256_CHECKSUM}
SOURCE_SUBDIR cpp
CMAKE_ARGS ${ARROW_CMAKE_ARGS}
BUILD_BYPRODUCTS ${ARROW_LIBDIR}/libarrow.a ${ARROW_LIBDIR}/libparquet.a
${ARROW_LIBDIR}/libarrow_testing.a ${THRIFT_LIB})

endif()
add_library(arrow STATIC IMPORTED GLOBAL)
add_library(arrow_testing STATIC IMPORTED GLOBAL)
add_library(parquet STATIC IMPORTED GLOBAL)
add_dependencies(arrow arrow_ep)
add_dependencies(arrow_testing arrow)
add_dependencies(parquet arrow)
file(MAKE_DIRECTORY ${ARROW_PREFIX}/install/include)
set_target_properties(
arrow arrow_testing parquet
PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${ARROW_PREFIX}/install/include)
set_target_properties(
arrow
PROPERTIES IMPORTED_LOCATION ${ARROW_LIBDIR}/libarrow.a)
set_property(TARGET arrow PROPERTY INTERFACE_LINK_LIBRARIES ${RE2} thrift)
set_target_properties(
arrow_testing
PROPERTIES IMPORTED_LOCATION ${ARROW_LIBDIR}/libarrow_testing.a)
set_target_properties(
parquet
PROPERTIES IMPORTED_LOCATION ${ARROW_LIBDIR}/libparquet.a)
19 changes: 4 additions & 15 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -136,8 +136,6 @@ option(VELOX_ENABLE_S3 "Build S3 Connector" OFF)
option(VELOX_ENABLE_GCS "Build GCS Connector" OFF)
option(VELOX_ENABLE_ABFS "Build Abfs Connector" OFF)
option(VELOX_ENABLE_HDFS "Build Hdfs Connector" OFF)
option(VELOX_ENABLE_PARQUET "Enable Parquet support" OFF)
option(VELOX_ENABLE_ARROW "Enable Arrow support" OFF)
option(VELOX_ENABLE_REMOTE_FUNCTIONS "Enable remote function support" OFF)
option(VELOX_ENABLE_CCACHE "Use ccache if installed." ON)

Expand Down Expand Up @@ -183,7 +181,6 @@ if(${VELOX_BUILD_TESTING})
set(VELOX_ENABLE_TPCH_CONNECTOR ON)
set(VELOX_ENABLE_SPARK_FUNCTIONS ON)
set(VELOX_ENABLE_EXAMPLES ON)
set(VELOX_ENABLE_PARQUET ON)
endif()

if(${VELOX_ENABLE_BENCHMARKS})
Expand Down Expand Up @@ -280,14 +277,6 @@ endif()
if(VELOX_ENABLE_HDFS)
add_definitions(-DVELOX_ENABLE_HDFS)
# JVM libhdfs requires arrow dependency.
set(VELOX_ENABLE_ARROW ON)
endif()

if(VELOX_ENABLE_PARQUET)
add_definitions(-DVELOX_ENABLE_PARQUET)
# Native Parquet reader requires Apache Thrift and Arrow Parquet writer, which
# are included in Arrow.
set(VELOX_ENABLE_ARROW ON)
endif()

# make buildPartitionBounds_ a vector int64 instead of int32 to avoid integer
Expand Down Expand Up @@ -633,9 +622,9 @@ if("${TREAT_WARNINGS_AS_ERRORS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror")
endif()

if(VELOX_ENABLE_ARROW)
velox_set_source(Arrow)
velox_resolve_dependency(Arrow)
endif()
# Native Parquet reader requires Apache Thrift and Arrow Parquet writer, which
# are included in Arrow.
velox_set_source(Arrow)
velox_resolve_dependency(Arrow)

add_subdirectory(velox)
3 changes: 1 addition & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ services:
environment:
NUM_THREADS: 8 # default value for NUM_THREADS
CCACHE_DIR: "/velox/.ccache"
EXTRA_CMAKE_FLAGS: -DVELOX_ENABLE_PARQUET=ON
-DVELOX_ENABLE_S3=ON
EXTRA_CMAKE_FLAGS: -DVELOX_ENABLE_S3=ON
volumes:
- .:/velox:delegated
working_dir: /velox
Expand Down
8 changes: 0 additions & 8 deletions velox/connectors/hive/HiveConnectorUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,7 @@
#include "velox/dwio/common/Reader.h"
#include "velox/dwio/dwrf/common/Config.h"
#include "velox/dwio/dwrf/writer/Writer.h"

#ifdef VELOX_ENABLE_PARQUET
#include "velox/dwio/parquet/writer/Writer.h" // @manual
#endif

#include "velox/expression/Expr.h"
#include "velox/expression/ExprToSubfieldFilter.h"
#include "velox/type/TimestampConversion.h"
Expand Down Expand Up @@ -905,7 +901,6 @@ core::TypedExprPtr extractFiltersFromRemainingFilter(

namespace {

#ifdef VELOX_ENABLE_PARQUET
std::optional<TimestampUnit> getTimestampUnit(
const config::ConfigBase& config,
const char* configKey) {
Expand Down Expand Up @@ -966,7 +961,6 @@ void updateParquetWriterOptions(

writerOptions = std::move(parquetWriterOptions);
}
#endif

void updateDWRFWriterOptions(
const std::shared_ptr<const HiveConfig>& hiveConfig,
Expand Down Expand Up @@ -1038,9 +1032,7 @@ void updateWriterOptionsFromHiveConfig(
updateDWRFWriterOptions(hiveConfig, sessionProperties, writerOptions);
break;
case dwio::common::FileFormat::PARQUET:
#ifdef VELOX_ENABLE_PARQUET
updateParquetWriterOptions(hiveConfig, sessionProperties, writerOptions);
#endif
break;
case dwio::common::FileFormat::NIMBLE:
// No-op for now.
Expand Down
18 changes: 6 additions & 12 deletions velox/connectors/hive/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,23 +26,17 @@ add_executable(
TableHandleTest.cpp)
add_test(velox_hive_connector_test velox_hive_connector_test)

target_include_directories(velox_hive_connector_test
PUBLIC ${ARROW_PREFIX}/install/include)
target_link_libraries(
velox_hive_connector_test
velox_dwio_common_exception
velox_dwio_native_parquet_reader
velox_exec
velox_exec_test_lib
velox_hive_connector
velox_hive_partition_function
velox_dwio_common_exception
velox_vector_fuzzer
velox_vector_test_lib
velox_exec
velox_exec_test_lib
GTest::gtest
GTest::gtest_main)

if(VELOX_ENABLE_PARQUET)

target_include_directories(velox_hive_connector_test
PUBLIC ${ARROW_PREFIX}/install/include)
target_link_libraries(velox_hive_connector_test
velox_dwio_native_parquet_reader)

endif()
8 changes: 1 addition & 7 deletions velox/connectors/hive/tests/HiveConnectorUtilTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,9 @@
#include "velox/connectors/hive/HiveConfig.h"
#include "velox/connectors/hive/HiveConnectorSplit.h"
#include "velox/connectors/hive/TableHandle.h"
#include "velox/exec/tests/utils/HiveConnectorTestBase.h"

#include "velox/dwio/dwrf/writer/Writer.h"

#ifdef VELOX_ENABLE_PARQUET
#include "velox/dwio/parquet/writer/Writer.h"
#endif
#include "velox/exec/tests/utils/HiveConnectorTestBase.h"

namespace facebook::velox::connector {

Expand Down Expand Up @@ -403,7 +399,6 @@ TEST_F(
"2");
}

#ifdef VELOX_ENABLE_PARQUET
TEST_F(HiveConnectorUtilTest, updateWriterOptionsFromHiveConfigParquet) {
auto fileFormat = dwio::common::FileFormat::PARQUET;
std::unordered_map<std::string, std::string> connectorConfig = {
Expand All @@ -427,6 +422,5 @@ TEST_F(HiveConnectorUtilTest, updateWriterOptionsFromHiveConfigParquet) {
parquetOptions->parquetWriteTimestampUnit.value(), TimestampUnit::kMilli);
ASSERT_EQ(parquetOptions->parquetWriteTimestampTimeZone.value(), "UTC");
}
#endif

} // namespace facebook::velox::connector
6 changes: 0 additions & 6 deletions velox/connectors/hive/tests/HiveDataSinkTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,8 @@
#include "velox/dwio/dwrf/reader/DwrfReader.h"
#include "velox/dwio/dwrf/writer/FlushPolicy.h"
#include "velox/dwio/dwrf/writer/Writer.h"

#ifdef VELOX_ENABLE_PARQUET
#include "velox/dwio/parquet/reader/ParquetReader.h"
#include "velox/dwio/parquet/writer/Writer.h"
#endif

#include "velox/exec/tests/utils/PlanBuilder.h"
#include "velox/exec/tests/utils/TempDirectoryPath.h"
#include "velox/vector/fuzzer/VectorFuzzer.h"
Expand Down Expand Up @@ -1131,7 +1127,6 @@ TEST_F(HiveDataSinkTest, insertTableHandleToString) {
"HiveInsertTableHandle [dwrf zstd], [inputColumns: [ HiveColumnHandle [name: c0, columnType: Regular, dataType: BIGINT, requiredSubfields: [ ]] HiveColumnHandle [name: c1, columnType: Regular, dataType: INTEGER, requiredSubfields: [ ]] HiveColumnHandle [name: c2, columnType: Regular, dataType: SMALLINT, requiredSubfields: [ ]] HiveColumnHandle [name: c3, columnType: Regular, dataType: REAL, requiredSubfields: [ ]] HiveColumnHandle [name: c4, columnType: Regular, dataType: DOUBLE, requiredSubfields: [ ]] HiveColumnHandle [name: c5, columnType: PartitionKey, dataType: VARCHAR, requiredSubfields: [ ]] HiveColumnHandle [name: c6, columnType: PartitionKey, dataType: BOOLEAN, requiredSubfields: [ ]] ], locationHandle: LocationHandle [targetPath: /path/to/test, writePath: /path/to/test, tableType: kNew,, bucketProperty: \nHiveBucketProperty[<HIVE_COMPATIBLE 4>\n\tBucket Columns:\n\t\tc5\n\tBucket Types:\n\t\tVARCHAR\n\tSortedBy Columns:\n\t\t[COLUMN[c5] ORDER[DESC NULLS LAST]]\n]\n]");
}

#ifdef VELOX_ENABLE_PARQUET
TEST_F(HiveDataSinkTest, flushPolicyWithParquet) {
const auto outputDirectory = TempDirectoryPath::create();
auto flushPolicyFactory = []() {
Expand Down Expand Up @@ -1167,7 +1162,6 @@ TEST_F(HiveDataSinkTest, flushPolicyWithParquet) {
EXPECT_EQ(fileMeta.numRowGroups(), 10);
EXPECT_EQ(fileMeta.rowGroup(0).numRows(), 500);
}
#endif

TEST_F(HiveDataSinkTest, flushPolicyWithDWRF) {
const auto outputDirectory = TempDirectoryPath::create();
Expand Down
Loading

0 comments on commit b342ac0

Please sign in to comment.