Skip to content

Commit

Permalink
Find or Install Parquet (#249)
Browse files Browse the repository at this point in the history
* Find pyarrow module automatically for Arrow and Parquet

- Find the pyarrow module by executing a Python script if A) PIP_PYARROW_ROOT is not set and B) find_package can't find Parquet or Arrow

* Install pyarrow if not found

* Install pyarrow if not found

* Install pyarrow v16.1.* in venv

* Upgrade pip in venv before installing pyarrow

* Update FindArrowParquet.cmake

Small typo

---------

Co-authored-by: Keita Iwabuchi <[email protected]>
Co-authored-by: Trevor Steil <[email protected]>
  • Loading branch information
3 people authored Sep 16, 2024
1 parent 8d51dd3 commit 41e4681
Show file tree
Hide file tree
Showing 4 changed files with 259 additions and 59 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ endif ()
#
include(FindArrowParquet)
option(YGM_REQUIRE_ARROW_PARQUET "YGM requires Apache Arrow Parquet." OFF)
find_arrow_parquet()
find_or_install_arrow_parquet()

#
# Create the YGM target library
Expand Down
195 changes: 137 additions & 58 deletions cmake/FindArrowParquet.cmake
Original file line number Diff line number Diff line change
@@ -1,44 +1,4 @@
# Find Arrow and Parquet using find_package.
#
# Input:
#   YGM_REQUIRE_ARROW_PARQUET (option) If TRUE, a fatal error is raised when
#                             Arrow and Parquet are not both found.
# Output:
#   Arrow_FOUND and Parquet_FOUND are propagated to the caller's scope.
function(find_arrow_parquet_config)
  # Find Arrow >= 8.0.
  # Probe major versions from 100 down to 8 so that this code does not have
  # to be updated every time Arrow releases a major version.
  # foreach(RANGE) only documents non-negative steps with stop >= start, so
  # count an offset upwards and derive the descending major version from it.
  foreach (version_offset RANGE 0 92)
    math(EXPR MAJOR_VERSION "100 - ${version_offset}")
    find_package(Arrow "${MAJOR_VERSION}.0" QUIET)
    if (Arrow_FOUND)
      break()
    endif ()
  endforeach ()
  set(Arrow_FOUND ${Arrow_FOUND} PARENT_SCOPE)

  # Find Parquet, searching next to the Arrow package that was just found.
  if (Arrow_FOUND)
    find_package(Parquet QUIET PATHS ${Arrow_DIR})
  endif ()
  set(Parquet_FOUND ${Parquet_FOUND} PARENT_SCOPE)

  # Show Arrow and Parquet info, or report that they are missing.
  if (Arrow_FOUND AND Parquet_FOUND)
    # The *_VERSION / *_FULL_SO_VERSION variables are presumably defined by
    # the Arrow and Parquet package configuration files.
    message(STATUS ${PROJECT_NAME} " found Arrow")
    message(STATUS "Arrow version: ${ARROW_VERSION}")
    message(STATUS "Arrow SO version: ${ARROW_FULL_SO_VERSION}")

    message(STATUS ${PROJECT_NAME} " found Parquet")
    message(STATUS "Parquet version: ${PARQUET_VERSION}")
    message(STATUS "Parquet SO version: ${PARQUET_FULL_SO_VERSION}")
  else ()
    if (YGM_REQUIRE_ARROW_PARQUET)
      message(FATAL_ERROR "${PROJECT_NAME} requires Arrow Parquet >= 8.0 but Arrow Parquet was not found.")
    else ()
      message(WARNING "${PROJECT_NAME} did not find Arrow Parquet >= 8.0. Building without Arrow Parquet.")
    endif ()
  endif ()
endfunction()

include(PythonUtilities)

# Find Arrow and Parquet installed along with pyarrow by pip.
#
Expand All @@ -60,7 +20,7 @@ endfunction()
# If Arrow and Parquet are found, set Arrow_FOUND and Parquet_FOUND to TRUE.
# Also, Arrow::arrow_shared and Parquet::parquet_shared are created as imported targets.
# Those targets can be used to link Arrow and Parquet as find_package() is used.
function(find_pyarrow)
function(find_pip_installed_pyarrow)
if (PIP_PYARROW_ROOT)
# Find libarrow
file(GLOB Arrow_LIBRARIES LIST_DIRECTORIES false "${PIP_PYARROW_ROOT}/libarrow.so.*")
Expand Down Expand Up @@ -116,12 +76,6 @@ function(find_pyarrow)
endif ()

message(STATUS "Arrow include dir: ${Arrow_INCLUDE_DIRS}")
else () # Arrow or Parquet not found
if (YGM_REQUIRE_ARROW_PARQUET)
message(FATAL_ERROR "${PROJECT_NAME} requires Arrow Parquet but Arrow Parquet was not found.")
else ()
message(WARNING "${PROJECT_NAME} did not find Arrow Parquet. Building without Arrow Parquet.")
endif ()
endif ()
else ()
message(FATAL_ERROR "PIP_PYARROW_ROOT is not set. PIP_PYARROW_ROOT must be set to the root of the pyarrow installation.")
Expand All @@ -130,29 +84,154 @@ function(find_pyarrow)
endfunction()


# Find Arrow and Parquet using find_arrow or find_pyarrow
# If PIP_PYARROW_ROOT is set, find_pyarrow is used.
# Find the directory where pyarrow is installed.
# This function executes a Python script to find the pyarrow module and
# **does not assume that pyarrow is installed by pip**.
#
# Output:
# Arrow_FOUND and Parquet_FOUND are set to TRUE if Arrow and Parquet are found.
function(find_arrow_parquet)
if (PIP_PYARROW_ROOT)
find_pyarrow()
else ()
find_arrow_parquet_config()
# PYARROW_ROOT is set to the root of the pyarrow installation.
function(find_pyarrow_package)
  # Locate the pyarrow module with the Python 3 interpreter and, when it is
  # found, publish the directory containing it as PYARROW_ROOT in the
  # caller's scope.
  #
  # Functions start with a copy of the caller's variables, so drop any
  # PYTHON3_MODULE_PATH left over from an earlier lookup; otherwise a failed
  # search below would be mistaken for a hit.
  unset(PYTHON3_MODULE_PATH)

  find_python3_module(pyarrow)
  if (PYTHON3_MODULE_PATH)
    # PYTHON3_MODULE_PATH is the path reported for the module itself; the
    # installation root is the directory that contains it.
    get_filename_component(PYARROW_ROOT "${PYTHON3_MODULE_PATH}" DIRECTORY)
    set(PYARROW_ROOT "${PYARROW_ROOT}" PARENT_SCOPE)
  endif ()
endfunction()

# Install pyarrow using pip inside a Python virtual environment.
#
# The environment is created with setup_python_venv(), activated for the
# duration of the installation and deactivated again before returning.
#
# Output:
#   PIP_PYARROW_ROOT is set (in the caller's scope) to the root of the pyarrow
#   installation if pyarrow was installed and could be located afterwards.
function(install_pyarrow_in_venv)
  # Functions start with a copy of the caller's variables. Clear the status
  # variables the helpers below report through, so that a value left over
  # from an earlier step cannot be mistaken for a fresh success.
  unset(PYTHON_VENV_ROOT)
  unset(PYTHON_VENV_ACTIVATED)
  unset(PIP_INSTALL_SUCCEEDED)
  unset(PYARROW_ROOT)

  setup_python_venv()
  if (NOT PYTHON_VENV_ROOT)
    return()
  endif ()

  activate_python_venv(${PYTHON_VENV_ROOT})
  if (NOT PYTHON_VENV_ACTIVATED)
    return()
  endif ()

  # Use only the Python 3 interpreter in the virtual environment
  set(Python3_FIND_VIRTUALENV ONLY)

  # Upgrade pip
  # Ignore the error status as failing to upgrade is not the end of the world
  upgrade_pip()

  # Install pyarrow
  pip_install_python_package("pyarrow==16.1.*")
  if (PIP_INSTALL_SUCCEEDED)
    find_pyarrow_package()
    if (PYARROW_ROOT)
      set(PIP_PYARROW_ROOT ${PYARROW_ROOT} PARENT_SCOPE)
    endif ()
  endif ()

  deactivate_python_venv()
endfunction()


# Find Arrow and Parquet using find_package.
# Output:
#   Arrow_FOUND is set to TRUE if Arrow is found.
#   Parquet_FOUND is set to TRUE if Parquet is found.
#   Both are propagated to the caller's scope. This function never raises an
#   error itself; the caller decides how to react when they are not found.
function(find_arrow_parquet_config)
  # Find Arrow >= 8.0.
  # Probe major versions from 100 down to 8 so that this code does not have
  # to be updated every time Arrow releases a major version.
  # foreach(RANGE) only documents non-negative steps with stop >= start, so
  # count an offset upwards and derive the descending major version from it.
  foreach (version_offset RANGE 0 92)
    math(EXPR MAJOR_VERSION "100 - ${version_offset}")
    find_package(Arrow "${MAJOR_VERSION}.0" QUIET)
    if (Arrow_FOUND)
      break()
    endif ()
  endforeach ()
  set(Arrow_FOUND ${Arrow_FOUND} PARENT_SCOPE)

  # Find Parquet, searching next to the Arrow package that was just found.
  if (Arrow_FOUND)
    find_package(Parquet QUIET PATHS ${Arrow_DIR})
  endif ()
  set(Parquet_FOUND ${Parquet_FOUND} PARENT_SCOPE)

  # Show Arrow and Parquet info
  if (Arrow_FOUND AND Parquet_FOUND)
    # The *_VERSION / *_FULL_SO_VERSION variables are presumably defined by
    # the Arrow and Parquet package configuration files.
    message(STATUS ${PROJECT_NAME} " found Arrow")
    message(STATUS "Arrow version: ${ARROW_VERSION}")
    message(STATUS "Arrow SO version: ${ARROW_FULL_SO_VERSION}")

    message(STATUS ${PROJECT_NAME} " found Parquet")
    message(STATUS "Parquet version: ${PARQUET_VERSION}")
    message(STATUS "Parquet SO version: ${PARQUET_FULL_SO_VERSION}")
  endif ()
endfunction()

# Find Arrow and Parquet. If not found, install pyarrow using pip in a Python virtual environment.
#
# The search proceeds in this order and stops at the first success:
#   1. The pyarrow installation given by PIP_PYARROW_ROOT (if set). When this
#      fails, no other strategy is tried.
#   2. find_package() via find_arrow_parquet_config().
#   3. A pyarrow module importable by the Python 3 interpreter.
#   4. A fresh pyarrow installed by pip into a virtual environment.
#
# Input:
#   PIP_PYARROW_ROOT (option) The root directory of a pyarrow installed by pip.
#   YGM_REQUIRE_ARROW_PARQUET (option) If TRUE, a fatal error is thrown when Arrow Parquet is not found.
# Output:
#   Arrow_FOUND and Parquet_FOUND are defined and set to TRUE if Arrow and Parquet are found.
function(find_or_install_arrow_parquet)
  # 1. An explicitly requested pyarrow installation.
  #    find_pip_installed_pyarrow() is expected to report its result through
  #    Arrow_FOUND and Parquet_FOUND in this scope.
  if (PIP_PYARROW_ROOT)
    find_pip_installed_pyarrow()
    if (NOT Arrow_FOUND OR NOT Parquet_FOUND)
      if (YGM_REQUIRE_ARROW_PARQUET)
        message(FATAL_ERROR "${PROJECT_NAME} requires Arrow Parquet but Arrow Parquet was not found in ${PIP_PYARROW_ROOT}.")
      else ()
        message(WARNING "${PROJECT_NAME} did not find Arrow Parquet in ${PIP_PYARROW_ROOT}. Building without Arrow Parquet.")
      endif ()
      return()
    endif ()
  endif ()

  # 2. Packages discoverable by find_package().
  if (NOT Arrow_FOUND OR NOT Parquet_FOUND)
    find_arrow_parquet_config()
  endif ()

  # 3. A pyarrow module that the Python 3 interpreter can already import.
  if (NOT Arrow_FOUND OR NOT Parquet_FOUND)
    find_pyarrow_package()
    if (PYARROW_ROOT)
      # Assume that the found pyarrow was installed by pip.
      set(PIP_PYARROW_ROOT ${PYARROW_ROOT})
      find_pip_installed_pyarrow()
    endif ()
  endif ()

  # 4. Install pyarrow into a virtual environment.
  if (NOT Arrow_FOUND OR NOT Parquet_FOUND)
    # Forget the location tried in step 3. Otherwise it would be inherited by
    # install_pyarrow_in_venv() and, if the installation fails, probed a
    # second time below as if it were the freshly installed package.
    unset(PYARROW_ROOT)
    unset(PIP_PYARROW_ROOT)

    install_pyarrow_in_venv()
    if (PIP_PYARROW_ROOT)
      find_pip_installed_pyarrow()
    endif ()
  endif ()

  # Every strategy failed.
  if (NOT Arrow_FOUND OR NOT Parquet_FOUND)
    message(STATUS "${PROJECT_NAME} could not find Arrow Parquet.")
    message(STATUS "If this is an unexpected result, try the following command to install pyarrow: export Python3_ROOT_DIR=/path/to/python3; /path/to/python3 -m pip install pyarrow")
    if (YGM_REQUIRE_ARROW_PARQUET)
      message(FATAL_ERROR "${PROJECT_NAME} requires Arrow Parquet.")
    else ()
      message(WARNING "${PROJECT_NAME} keep the build process without Arrow Parquet.")
    endif ()
    return()
  endif ()

  set(Arrow_FOUND TRUE PARENT_SCOPE)
  set(Parquet_FOUND TRUE PARENT_SCOPE)
endfunction()


# Link Arrow and Parquet to the target
# This function must be called after find_arrow_parquet().
# This function must be called after find_or_install_arrow_parquet().
function(link_arrow_parquet target)
  # Links the Arrow and Parquet shared-library imported targets to `target`
  # with PUBLIC visibility. Relies on Arrow_FOUND and Parquet_FOUND being
  # visible in the calling scope.

  # Without both packages there is nothing to link: warn and leave the
  # target untouched.
  if (NOT (Arrow_FOUND AND Parquet_FOUND))
    message(WARNING "Arrow or Parquet not found. Not linking Arrow or Parquet.")
    return()
  endif ()

  target_link_libraries(${target}
    PUBLIC
      Arrow::arrow_shared
      Parquet::parquet_shared)
endfunction()
endfunction()
18 changes: 18 additions & 0 deletions cmake/FindPython3Module.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Find a Python3 module using CMake's FindPython3 module.
# Input: module name to find
#        Python3_ROOT_DIR can be used as a hint to find Python3
#
# Output: PYTHON3_MODULE_PATH is set (in the caller's scope) to the path of
#         the module if found
#
# Note: the Python 3 interpreter is REQUIRED; configuration stops with an
#       error if it cannot be found.
function(find_python3_module module_name)
  find_package(Python3 COMPONENTS Interpreter REQUIRED)

  # The script prints the module's origin path and exits with 0 when the
  # module can be located, and exits with 1 otherwise.
  # importlib.util must be imported explicitly: a plain "import importlib"
  # does not guarantee that the "util" submodule is loaded.
  execute_process(
    COMMAND ${Python3_EXECUTABLE} -c "import importlib.util; import sys; module_name = '${module_name}'; spec = importlib.util.find_spec(module_name); print(spec.origin if spec else ''); sys.exit(0 if spec else 1)"
    OUTPUT_VARIABLE MODULE_PATH
    OUTPUT_STRIP_TRAILING_WHITESPACE
    RESULT_VARIABLE result
  )

  # Trust the captured output only when the interpreter reported success.
  if (Python3_FOUND AND result EQUAL "0" AND MODULE_PATH)
    set(PYTHON3_MODULE_PATH ${MODULE_PATH} PARENT_SCOPE)
  endif ()
endfunction()
103 changes: 103 additions & 0 deletions cmake/PythonUtilities.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Create a Python3 virtual environment in the build tree.
#
# The environment is only created here, not activated; use
# activate_python_venv() for that.
#
# Output: PYTHON_VENV_ROOT is set (in the caller's scope) to the path of the
#         virtual environment if created successfully
function(setup_python_venv)
  find_package(Python3 COMPONENTS Interpreter QUIET)
  if (NOT Python3_Interpreter_FOUND)
    message(WARNING "Python3 interpreter not found")
    return()
  endif ()

  set(PYTHON_VENV_ROOT "${CMAKE_BINARY_DIR}/${PROJECT_NAME}-venv")
  execute_process(
    COMMAND ${Python3_EXECUTABLE} -m venv ${PYTHON_VENV_ROOT}
    RESULT_VARIABLE result
    OUTPUT_QUIET
  )

  # Report the failure instead of returning silently, so that a missing
  # environment can be traced back to this step.
  if (NOT result EQUAL "0")
    message(WARNING "Failed to create Python virtual environment in ${PYTHON_VENV_ROOT} (result: ${result})")
    return()
  endif ()

  message(STATUS "Created Python virtual environment in ${PYTHON_VENV_ROOT}")
  set(PYTHON_VENV_ROOT ${PYTHON_VENV_ROOT} PARENT_SCOPE)
endfunction()

# Activate a Python3 virtual environment
#
# Activation consists of pointing the VIRTUAL_ENV environment variable of the
# running CMake process at the environment.
#
# Input: A path to the virtual environment
# Output: PYTHON_VENV_ACTIVATED is set to TRUE (in the caller's scope)
function(activate_python_venv venv_path)
  # Quote the path so that it is stored as a single value even if it
  # contains a semicolon.
  set(ENV{VIRTUAL_ENV} "${venv_path}")
  set(PYTHON_VENV_ACTIVATED TRUE PARENT_SCOPE)
endfunction()

# Leave the currently active Python3 virtual environment.
#
# Output: PYTHON_VENV_ACTIVATED is set to FALSE in the caller's scope and the
#         VIRTUAL_ENV environment variable is removed from the running CMake
#         process.
function(deactivate_python_venv)
  set(PYTHON_VENV_ACTIVATED FALSE PARENT_SCOPE)
  unset(ENV{VIRTUAL_ENV})
endfunction()

# Bring pip up to date for the Python3 interpreter that find_package() selects.
#
# Output: PIP_UPGRADE_SUCCEEDED is set to TRUE (in the caller's scope) when
#         the upgrade command exits with status 0. It is left untouched
#         otherwise.
function(upgrade_pip)
  find_package(Python3 COMPONENTS Interpreter QUIET)

  if (Python3_Interpreter_FOUND)
    # Run "python -m pip" so that the pip belonging to the selected
    # interpreter is the one being upgraded.
    execute_process(
      COMMAND ${Python3_EXECUTABLE} -m pip install --upgrade pip
      RESULT_VARIABLE pip_status
      OUTPUT_QUIET
    )
    if (pip_status EQUAL "0")
      set(PIP_UPGRADE_SUCCEEDED TRUE PARENT_SCOPE)
    endif ()
  else ()
    message(WARNING "Python3 interpreter not found")
  endif ()
endfunction()

# Install a Python3 package using pip
#
# pip is run as "python -m pip" with the Python3 interpreter selected by
# find_package().
#
# Input: A package name (any requirement specifier accepted by "pip install")
# Output: PIP_INSTALL_SUCCEEDED is set (in the caller's scope) to TRUE
#         if the package was installed successfully, and to FALSE otherwise
function(pip_install_python_package package_name)
  # Report failure unless the installation below succeeds, so that a value
  # left over from an earlier call cannot be mistaken for this call's result.
  set(PIP_INSTALL_SUCCEEDED FALSE PARENT_SCOPE)

  find_package(Python3 COMPONENTS Interpreter QUIET)
  if (NOT Python3_Interpreter_FOUND)
    message(WARNING "Python3 interpreter not found")
    return()
  endif ()

  execute_process(
    COMMAND ${Python3_EXECUTABLE} -m pip install ${package_name}
    RESULT_VARIABLE result
    OUTPUT_QUIET
  )

  if (NOT result EQUAL "0")
    message(WARNING "Failed to install ${package_name} (result: ${result})")
    return()
  endif ()

  message(STATUS "Installed ${package_name}")
  set(PIP_INSTALL_SUCCEEDED TRUE PARENT_SCOPE)
endfunction()

# Ask the Python3 interpreter where a module lives.
#
# Input: the name of the module to look up
#        (Python3_ROOT_DIR can be used as a hint to find Python3)
#
# Output: PYTHON3_MODULE_PATH is set (in the caller's scope) to the path
#         printed by the interpreter when the lookup succeeds. It is left
#         untouched otherwise.
function(find_python3_module module_name)
  find_package(Python3 COMPONENTS Interpreter QUIET)
  if (NOT Python3_Interpreter_FOUND)
    message(WARNING "Python3 interpreter not found")
    return()
  endif ()

  # One-line script: print the origin of the module's import spec and exit
  # with 0 when a spec exists; print an empty line and exit with 1 when not.
  set(lookup_script "import importlib.util; import sys; module_name = '${module_name}'; spec = importlib.util.find_spec(module_name); print(spec.origin if spec else ''); sys.exit(0 if spec else 1)")

  execute_process(
    COMMAND ${Python3_EXECUTABLE} -c "${lookup_script}"
    RESULT_VARIABLE lookup_status
    OUTPUT_VARIABLE module_origin
    OUTPUT_STRIP_TRAILING_WHITESPACE
  )

  # A non-zero status means the module is unavailable; report nothing.
  if (NOT lookup_status EQUAL "0")
    return()
  endif ()

  set(PYTHON3_MODULE_PATH ${module_origin} PARENT_SCOPE)
  message(STATUS "Found Python module ${module_name} at ${module_origin}")
endfunction()

0 comments on commit 41e4681

Please sign in to comment.