diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 09f7a42b893d..47ae7a38efce 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -52,6 +52,7 @@ option(ENABLE_GCS "Enable GCS" OFF)
 option(ENABLE_S3 "Enable S3" OFF)
 option(ENABLE_HDFS "Enable HDFS" OFF)
 option(ENABLE_ORC "Enable ORC" OFF)
+option(ENABLE_ABFS "Enable ABFS" OFF)
 
 set(root_directory ${PROJECT_BINARY_DIR})
 get_filename_component(GLUTEN_HOME ${CMAKE_SOURCE_DIR} DIRECTORY)
diff --git a/cpp/compile.sh b/cpp/compile.sh
index 986d0ec8cd29..b14e48b16fac 100755
--- a/cpp/compile.sh
+++ b/cpp/compile.sh
@@ -28,6 +28,7 @@ ENABLE_HBM=OFF
 ENABLE_GCS=OFF
 ENABLE_S3=OFF
 ENABLE_HDFS=OFF
+ENABLE_ABFS=OFF
 VELOX_HOME=
 NPROC=$(nproc --ignore=2)
@@ -81,6 +82,10 @@ for arg in "$@"; do
    ENABLE_S3=("${arg#*=}")
    shift # Remove argument name from processing
    ;;
+   --enable_abfs=*)
+   ENABLE_ABFS=("${arg#*=}")
+   shift # Remove argument name from processing
+   ;;
    --enable_hdfs=*)
    ENABLE_HDFS=("${arg#*=}")
    shift # Remove argument name from processing
    ;;
@@ -116,6 +121,7 @@ echo "BUILD_PROTOBUF=${BUILD_PROTOBUF}"
 echo "ENABLE_GCS=${ENABLE_GCS}"
 echo "ENABLE_S3=${ENABLE_S3}"
 echo "ENABLE_HDFS=${ENABLE_HDFS}"
+echo "ENABLE_ABFS=${ENABLE_ABFS}"
 
 if [ -d build ]; then
   rm -r build
@@ -135,5 +141,6 @@ cmake .. \
   -DENABLE_HBM=${ENABLE_HBM} \
   -DENABLE_GCS=${ENABLE_GCS} \
   -DENABLE_S3=${ENABLE_S3} \
-  -DENABLE_HDFS=${ENABLE_HDFS}
+  -DENABLE_HDFS=${ENABLE_HDFS} \
+  -DENABLE_ABFS=${ENABLE_ABFS}
 make -j$NPROC
diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt
index 0d4a99dfd658..0f347d133a04 100644
--- a/cpp/velox/CMakeLists.txt
+++ b/cpp/velox/CMakeLists.txt
@@ -149,6 +149,9 @@ macro(ADD_VELOX_DEPENDENCIES)
   if(ENABLE_S3)
     add_velox_dependency(connector::hive::s3fs "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/s3fs/libvelox_s3fs.a")
   endif()
+  if(ENABLE_ABFS)
+    add_velox_dependency(connector::hive::abfs "${VELOX_COMPONENTS_PATH}/connectors/hive/storage_adapters/abfs/libvelox_abfs.a")
+  endif()
   add_velox_dependency(dwio::dwrf::writer "${VELOX_COMPONENTS_PATH}/dwio/dwrf/writer/libvelox_dwio_dwrf_writer.a")
   add_velox_dependency(dwio::dwrf::reader "${VELOX_COMPONENTS_PATH}/dwio/dwrf/reader/libvelox_dwio_dwrf_reader.a")
   add_velox_dependency(dwio::dwrf::utils "${VELOX_COMPONENTS_PATH}/dwio/dwrf/utils/libvelox_dwio_dwrf_utils.a")
@@ -266,7 +269,10 @@ macro(find_gcssdk)
   find_package(google_cloud_cpp_storage REQUIRED)
 endmacro()
 
-
+macro(find_azure)
+  set(CMAKE_FIND_LIBRARY_SUFFIXES ".a")
+  find_package(azure-storage-blobs-cpp CONFIG REQUIRED)
+endmacro()
 
 # Build Velox backend.
 set(VELOX_SRCS
@@ -393,3 +399,9 @@ endif()
 if(BUILD_EXAMPLES)
   add_subdirectory(udf/examples)
 endif()
+
+if(ENABLE_ABFS)
+  add_definitions(-DENABLE_ABFS)
+  find_azure()
+  target_link_libraries(velox PRIVATE Azure::azure-storage-blobs)
+endif()
\ No newline at end of file
diff --git a/cpp/velox/compute/VeloxBackend.cc b/cpp/velox/compute/VeloxBackend.cc
index 9dcbd9b49416..b25b35e49b88 100644
--- a/cpp/velox/compute/VeloxBackend.cc
+++ b/cpp/velox/compute/VeloxBackend.cc
@@ -34,6 +34,9 @@
 #ifdef ENABLE_GCS
 #include
 #endif
+#ifdef ENABLE_ABFS
+#include "velox/connectors/hive/storage_adapters/abfs/AbfsFileSystem.h"
+#endif
 #include "config/GlutenConfig.h"
 #include "jni/JniFileSystem.h"
 #include "operators/functions/SparkTokenizer.h"
@@ -284,6 +287,20 @@ void VeloxBackend::initConnector(const facebook::velox::Config* conf) {
   mutableConf->setValue("hive.s3.path-style-access", pathStyleAccess ? "true" : "false");
 #endif
 
+#ifdef ENABLE_ABFS
+  velox::filesystems::abfs::registerAbfsFileSystem();
+  const auto& confValue = conf->valuesCopy();
+  for (auto& [k, v] : confValue) {
+    if (k.find("fs.azure.account.key") == 0) {
+      mutableConf->setValue(k, v);
+    } else if (k.find("spark.hadoop.fs.azure.account.key") == 0) {
+      // 13 == strlen("spark.hadoop."): strip Spark's Hadoop prefix so Velox sees the plain key
+      constexpr int32_t accountKeyPrefixLength = 13;
+      mutableConf->setValue(k.substr(accountKeyPrefixLength), v);
+    }
+  }
+#endif
+
 #ifdef ENABLE_GCS
   // https://github.com/GoogleCloudDataproc/hadoop-connectors/blob/master/gcs/CONFIGURATION.md#api-client-configuration
   auto gsStorageRootUrl = conf->get("spark.hadoop.fs.gs.storage.root.url");
diff --git a/dev/builddeps-veloxbe.sh b/dev/builddeps-veloxbe.sh
index 588221d28aef..ce7c76e540e6 100755
--- a/dev/builddeps-veloxbe.sh
+++ b/dev/builddeps-veloxbe.sh
@@ -20,6 +20,7 @@ ENABLE_HBM=OFF
 ENABLE_GCS=OFF
 ENABLE_S3=OFF
 ENABLE_HDFS=OFF
+ENABLE_ABFS=OFF
 ENABLE_EP_CACHE=OFF
 SKIP_BUILD_EP=OFF
 ARROW_ENABLE_CUSTOM_CODEC=OFF
@@ -78,6 +79,10 @@ do
    ENABLE_HDFS=("${arg#*=}")
    shift # Remove argument name from processing
    ;;
+   --enable_abfs=*)
+   ENABLE_ABFS=("${arg#*=}")
+   shift # Remove argument name from processing
+   ;;
    --enable_ep_cache=*)
    ENABLE_EP_CACHE=("${arg#*=}")
    shift # Remove argument name from processing
@@ -106,8 +111,8 @@ fi
 ##install velox
 if [ "$SKIP_BUILD_EP" != "ON" ]; then
     cd $GLUTEN_DIR/ep/build-velox/src
-    ./get_velox.sh --enable_hdfs=$ENABLE_HDFS --build_protobuf=$BUILD_PROTOBUF --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS
-    ./build_velox.sh --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --build_type=$BUILD_TYPE --enable_hdfs=$ENABLE_HDFS \
+    ./get_velox.sh --enable_hdfs=$ENABLE_HDFS --build_protobuf=$BUILD_PROTOBUF --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --enable_abfs=$ENABLE_ABFS
+    ./build_velox.sh --enable_s3=$ENABLE_S3 --enable_gcs=$ENABLE_GCS --build_type=$BUILD_TYPE --enable_hdfs=$ENABLE_HDFS --enable_abfs=$ENABLE_ABFS \
         --enable_ep_cache=$ENABLE_EP_CACHE --build_tests=$BUILD_TESTS --build_benchmarks=$BUILD_BENCHMARKS
 fi
 
@@ -118,5 +123,5 @@ mkdir build
 cd build
 cmake -DBUILD_VELOX_BACKEND=ON -DCMAKE_BUILD_TYPE=$BUILD_TYPE \
       -DBUILD_TESTS=$BUILD_TESTS -DBUILD_EXAMPLES=$BUILD_EXAMPLES -DBUILD_BENCHMARKS=$BUILD_BENCHMARKS -DBUILD_JEMALLOC=$BUILD_JEMALLOC \
-      -DENABLE_HBM=$ENABLE_HBM -DENABLE_QAT=$ENABLE_QAT -DENABLE_IAA=$ENABLE_IAA -DENABLE_GCS=$ENABLE_GCS -DENABLE_S3=$ENABLE_S3 -DENABLE_HDFS=$ENABLE_HDFS ..
+      -DENABLE_HBM=$ENABLE_HBM -DENABLE_QAT=$ENABLE_QAT -DENABLE_IAA=$ENABLE_IAA -DENABLE_GCS=$ENABLE_GCS -DENABLE_S3=$ENABLE_S3 -DENABLE_HDFS=$ENABLE_HDFS -DENABLE_ABFS=$ENABLE_ABFS ..
 make -j
diff --git a/ep/build-velox/src/build_velox.sh b/ep/build-velox/src/build_velox.sh
index 06bda8b4a103..5409cc2f382e 100755
--- a/ep/build-velox/src/build_velox.sh
+++ b/ep/build-velox/src/build_velox.sh
@@ -21,6 +21,8 @@ ENABLE_S3=OFF
 ENABLE_GCS=OFF
 #Set on run gluten on HDFS
 ENABLE_HDFS=OFF
+#Set on run gluten on ABFS
+ENABLE_ABFS=OFF
 BUILD_TYPE=release
 VELOX_HOME=""
 ENABLE_EP_CACHE=OFF
@@ -49,6 +51,10 @@ for arg in "$@"; do
    ENABLE_HDFS=("${arg#*=}")
    shift # Remove argument name from processing
    ;;
+   --enable_abfs=*)
+   ENABLE_ABFS=("${arg#*=}")
+   shift # Remove argument name from processing
+   ;;
    --build_type=*)
    BUILD_TYPE=("${arg#*=}")
    shift # Remove argument name from processing
    ;;
@@ -113,6 +119,9 @@ function compile {
   if [ $ENABLE_HDFS == "ON" ]; then
     COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_HDFS=ON"
   fi
+  if [ $ENABLE_ABFS == "ON" ]; then
+    COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_ABFS=ON"
+  fi
   if [ $ENABLE_S3 == "ON" ]; then
     COMPILE_OPTION="$COMPILE_OPTION -DVELOX_ENABLE_S3=ON"
   fi
@@ -254,6 +263,7 @@ echo "VELOX_HOME=${VELOX_HOME}"
 echo "ENABLE_S3=${ENABLE_S3}"
 echo "ENABLE_GCS=${ENABLE_GCS}"
 echo "ENABLE_HDFS=${ENABLE_HDFS}"
+echo "ENABLE_ABFS=${ENABLE_ABFS}"
 echo "BUILD_TYPE=${BUILD_TYPE}"
 
 cd ${VELOX_HOME}
diff --git a/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala b/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
index 92dc9a291c98..ab26c127b2ac 100644
--- a/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
+++ b/shims/common/src/main/scala/io/glutenproject/GlutenConfig.scala
@@ -335,6 +335,10 @@ object GlutenConfig {
   // Hardware acceleraters backend
   val GLUTEN_SHUFFLE_CODEC_BACKEND = "spark.gluten.sql.columnar.shuffle.codecBackend"
 
+  // ABFS config
+  val ABFS_ACCOUNT_KEY = "hadoop.fs.azure.account.key"
+  val SPARK_ABFS_ACCOUNT_KEY: String = "spark." + ABFS_ACCOUNT_KEY
+
   // QAT config
   val GLUTEN_QAT_BACKEND_NAME = "qat"
   val GLUTEN_QAT_SUPPORTED_CODEC: Set[String] = Set("gzip", "zstd")
@@ -536,6 +540,10 @@
       .filter(_._1.startsWith(HADOOP_PREFIX + S3A_PREFIX))
       .foreach(entry => nativeConfMap.put(entry._1, entry._2))
 
+    conf
+      .filter(_._1.startsWith(SPARK_ABFS_ACCOUNT_KEY))
+      .foreach(entry => nativeConfMap.put(entry._1, entry._2))
+
     // return
     nativeConfMap
   }
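
Taken together, the GlutenConfig.scala and VeloxBackend.cc changes define the credential path: the Scala side forwards only session keys starting with spark.hadoop.fs.azure.account.key into the native conf map, and the ENABLE_ABFS block in initConnector rewrites them into the Hadoop-style names Velox's ABFS filesystem reads, while keys that already arrive in plain fs.azure.account.key form pass through unchanged. Below is a minimal standalone sketch of that mapping, using a plain std::map in place of velox::Config and a made-up storage account name; it is illustrative only, not part of this patch.

// abfs_key_mapping.cpp -- illustrative sketch, not part of this patch.
// Mirrors the ENABLE_ABFS block in VeloxBackend.cc with a plain std::map
// standing in for velox::Config. Build: g++ -std=c++17 abfs_key_mapping.cpp
#include <cassert>
#include <map>
#include <string>

int main() {
  constexpr char kSparkHadoopPrefix[] = "spark.hadoop.";
  // The patch hard-codes accountKeyPrefixLength = 13, i.e. strlen("spark.hadoop.").
  static_assert(sizeof(kSparkHadoopPrefix) - 1 == 13, "must match accountKeyPrefixLength");

  const std::map<std::string, std::string> sessionConf = {
      // hypothetical storage account name, for illustration only
      {"spark.hadoop.fs.azure.account.key.myaccount.dfs.core.windows.net", "secret"},
      {"spark.sql.shuffle.partitions", "200"}, // unrelated key: never forwarded
  };

  std::map<std::string, std::string> hiveConf;
  for (const auto& [key, value] : sessionConf) {
    if (key.find("fs.azure.account.key") == 0) {
      hiveConf[key] = value; // already a Hadoop-style key: forward as-is
    } else if (key.find("spark.hadoop.fs.azure.account.key") == 0) {
      hiveConf[key.substr(13)] = value; // strip "spark.hadoop."
    }
  }

  // Velox's ABFS filesystem ends up seeing exactly one key, in plain Hadoop form.
  assert(hiveConf.size() == 1);
  assert(hiveConf.count("fs.azure.account.key.myaccount.dfs.core.windows.net") == 1);
  return 0;
}

In practice this means building with --enable_abfs=ON (via cpp/compile.sh or dev/builddeps-veloxbe.sh) and setting the usual Hadoop-Azure property on the Spark session, e.g. spark.hadoop.fs.azure.account.key.<account>.dfs.core.windows.net. Note that only account-key authentication is wired up by this change: no other fs.azure.* settings are forwarded to the native side.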