From 1f2a4fbd3af8342e64627cf02024c003cb531f14 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 31 Jan 2024 16:03:28 -0500 Subject: [PATCH 1/8] [cmake] bump MADNESS, BTAS, and TA tags to sync with latest; MADNESS points to main repo now --- CMakeLists.txt | 2 +- .../ExternalDependenciesVersions.cmake | 7 +- cmake/modules/FindOrFetchBoost.cmake | 97 ++++++++++--------- cmake/modules/FindOrFetchMADNESS.cmake | 2 +- examples/CMakeLists.txt | 8 +- ttg/CMakeLists.txt | 17 +++- 6 files changed, 74 insertions(+), 59 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 684d3a58c..1607cfcb9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ endif (BUILD_TESTING) #### optional prerequisites ########################### # Boost -include(FindOrFetchBoost) +include("${PROJECT_SOURCE_DIR}/cmake/modules/FindOrFetchBoost.cmake") # Cereal #include(FindOrFetchCereal) # C++ coroutines diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index 8d763f5db..b3d7c8e55 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -3,9 +3,10 @@ # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work set(TTG_TRACKED_BOOST_VERSION 1.66) +set(TTG_TRACKED_VG_CMAKE_KIT_TAG d5c0a6f9ff6dc97cbb5132912733e1eb1cf73f1e) # used to provide "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_CEREAL_VERSION 1.3.0) -set(TTG_TRACKED_MADNESS_TAG cb195817d7807c4aead10ba200cd20649036cbae) +set(TTG_TRACKED_MADNESS_TAG 8788aea9758bfe6479cc23d39e6c77b7528009db) set(TTG_TRACKED_PARSEC_TAG 25d1931e863b6741e453112d2117d85ad32e7fba) -set(TTG_TRACKED_BTAS_TAG a02be0d29fb4a788ecef43de711dcd6d6f1cb6b8) -set(TTG_TRACKED_TILEDARRAY_TAG f0115e9e4a3f988224afbfb3c241e92171e916b8) +set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) +set(TTG_TRACKED_TILEDARRAY_TAG 
493c109379a1b64ddd5ef59f7e33b95633b68d73) diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake index e1733e925..ed1fc9b2a 100644 --- a/cmake/modules/FindOrFetchBoost.cmake +++ b/cmake/modules/FindOrFetchBoost.cmake @@ -1,50 +1,55 @@ -if (NOT TARGET Boost::boost) - find_package(Boost ${TTG_TRACKED_BOOST_VERSION} QUIET CONFIG OPTIONAL_COMPONENTS serialization) -endif(NOT TARGET Boost::boost) +# Boost can be discovered by every (sub)package but only the top package can *build* it ... +# in either case must declare the components used by TA +set(required_components + headers + callable_traits +) +if (TTG_PARSEC_USE_BOOST_SERIALIZATION) + list(APPEND required_components + serialization + iostreams + ) +endif() +if (DEFINED Boost_REQUIRED_COMPONENTS) + list(APPEND Boost_REQUIRED_COMPONENTS + ${required_components}) + list(REMOVE_DUPLICATES Boost_REQUIRED_COMPONENTS) +else() + set(Boost_REQUIRED_COMPONENTS "${required_components}" CACHE STRING "Components of Boost to discovered or built") +endif() +set(optional_components +) +if (DEFINED Boost_OPTIONAL_COMPONENTS) + list(APPEND Boost_OPTIONAL_COMPONENTS + ${optional_components} + ) + list(REMOVE_DUPLICATES Boost_OPTIONAL_COMPONENTS) +else() + set(Boost_OPTIONAL_COMPONENTS "${optional_components}" CACHE STRING "Optional components of Boost to discovered or built") +endif() -if (TARGET Boost::boost) - set(_msg "Found Boost at ${Boost_CONFIG}") - if (TARGET Boost::serialization) - list(APPEND _msg " includes Boost::serialization") - endif(TARGET Boost::serialization) - message(STATUS "${_msg}") +if (NOT DEFINED Boost_FETCH_IF_MISSING AND TTG_FETCH_BOOST) + set(Boost_FETCH_IF_MISSING 1) +endif() - # Boost::* targets by default are not GLOBAL, so to allow users of TTG to safely use them we need to make them global - # more discussion here: https://gitlab.kitware.com/cmake/cmake/-/issues/17256 - foreach(tgt boost;headers;${Boost_BTAS_DEPS_LIBRARIES}) - if (TARGET Boost::${tgt}) - 
get_target_property(_boost_tgt_${tgt}_is_imported_global Boost::${tgt} IMPORTED_GLOBAL) - if (NOT _boost_tgt_${tgt}_is_imported_global) - set_target_properties(Boost::${tgt} PROPERTIES IMPORTED_GLOBAL TRUE) - endif() - unset(_boost_tgt_${tgt}_is_imported_global) +# Bring ValeevGroup cmake toolkit, if not yet available +if (NOT DEFINED vg_cmake_kit_SOURCE_DIR) + include(FetchContent) + if (DEFINED PROJECT_BINARY_DIR) + set(VG_CMAKE_KIT_PREFIX_DIR PROJECT_BINARY_DIR) + else () + set(VG_CMAKE_KIT_PREFIX_DIR CMAKE_CURRENT_BINARY_DIR) endif() - endforeach() - -elseif (TTG_FETCH_BOOST) - - FetchContent_Declare( - CMAKEBOOST - GIT_REPOSITORY https://github.com/Orphis/boost-cmake - ) - FetchContent_MakeAvailable(CMAKEBOOST) - FetchContent_GetProperties(CMAKEBOOST - SOURCE_DIR CMAKEBOOST_SOURCE_DIR - BINARY_DIR CMAKEBOOST_BINARY_DIR - ) - - # current boost-cmake/master does not install boost correctly, so warn that installed TTG will not be usable - # boost-cmake/install_rules https://github.com/Orphis/boost-cmake/pull/45 is supposed to fix it but is inactive - message(WARNING "Building Boost from source makes TTG unusable from the install location! 
Install Boost using package manager or manually and reconfigure/reinstall TTG to fix this") - - if (TARGET Boost::serialization AND TARGET Boost_serialization) - install(TARGETS Boost_serialization EXPORT boost) - export(EXPORT boost - FILE "${PROJECT_BINARY_DIR}/boost-targets.cmake") - install(EXPORT boost - FILE "boost-targets.cmake" - DESTINATION "${CMAKE_INSTALL_CMAKEDIR}" - COMPONENT boost-libs) - endif() - + FetchContent_Declare( + vg_cmake_kit + QUIET + GIT_REPOSITORY https://github.com/ValeevGroup/kit-cmake.git + GIT_TAG ${TTG_TRACKED_VG_CMAKE_KIT_TAG} + SOURCE_DIR ${${VG_CMAKE_KIT_PREFIX_DIR}}/cmake/vg + BINARY_DIR ${${VG_CMAKE_KIT_PREFIX_DIR}}/cmake/vg-build + SUBBUILD_DIR ${${VG_CMAKE_KIT_PREFIX_DIR}}/cmake/vg-subbuild + ) + FetchContent_MakeAvailable(vg_cmake_kit) endif() +include(${vg_cmake_kit_SOURCE_DIR}/modules/FindOrFetchBoost.cmake) + diff --git a/cmake/modules/FindOrFetchMADNESS.cmake b/cmake/modules/FindOrFetchMADNESS.cmake index f112e4ff7..24f0bc798 100644 --- a/cmake/modules/FindOrFetchMADNESS.cmake +++ b/cmake/modules/FindOrFetchMADNESS.cmake @@ -10,7 +10,7 @@ if (NOT TARGET MADworld) set(MADNESS_TASK_BACKEND PaRSEC CACHE STRING "The task backend to use for MADNESS tasks") FetchContent_Declare( MADNESS - GIT_REPOSITORY https://github.com/therault/madness.git + GIT_REPOSITORY https://github.com/m-a-d-n-e-s-s/madness.git GIT_TAG ${TTG_TRACKED_MADNESS_TAG} ) FetchContent_MakeAvailable(MADNESS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 87aa0b127..10b808540 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -10,7 +10,7 @@ if (TARGET tiledarray) add_ttg_executable(spmm spmm/spmm.cc LINK_LIBRARIES TiledArray_Eigen) # block-sparse needs BTAS ... 
it's always provided by TA # since only need to use matrices, limit BTAS_TARGET_MAX_INDEX_RANK to 2 - add_ttg_executable(bspmm spmm/spmm.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2) + add_ttg_executable(bspmm spmm/spmm.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2) add_ttg_executable(testing_dpotrf potrf/testing_dpotrf.cc LINK_LIBRARIES tiledarray lapackpp) add_ttg_executable(testing_dtrtri potrf/testing_dtrtri.cc LINK_LIBRARIES tiledarray lapackpp) @@ -19,7 +19,7 @@ if (TARGET tiledarray) if (TARGET CUDA::cublas) add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc - LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost CUDA::cublas + LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS CUDA::cublas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") @@ -31,7 +31,7 @@ if (TARGET tiledarray) endif(TARGET CUDA::cusolver) elseif (TARGET roc::hipblas) add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc - LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost roc::hipblas + LINK_LIBRARIES tiledarray TiledArray_Eigen roc::hipblas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") if (TARGET roc::hipsolver) @@ -42,7 +42,7 @@ if (TARGET tiledarray) endif(TARGET roc::hipsolver) elseif (TARGET MKL::MKL_DPCPP) add_ttg_executable(bspmm-lz spmm/spmm_cuda.cc - LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost MKL::MKL_DPCPP level_zero::ze_loader m + LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS MKL::MKL_DPCPP level_zero::ze_loader m COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") endif() diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 9a08adc6a..5d40c5b20 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -94,9 +94,13 @@ 
set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/ttg/util/version.cc COMPILE_DEFINITIONS "TTG_GIT_REVISION=\"${TTG_GIT_REVISION}\";TTG_GIT_DESCRIPTION=\"${TTG_GIT_DESCRIPTION}\"") # optional dependencies -if (TARGET Boost::boost) - list(APPEND ttg-deps Boost::boost) -else () # if Boost::boost is missing must use bundled Boost.CallableTraits +if (TARGET Boost::headers) + if (TARGET Boost::callable_traits) # using modularized Boost? + list(APPEND ttg-deps Boost::callable_traits) + else() + list(APPEND ttg-deps Boost::headers) + endif() +else () # if Boost::headers is missing must use bundled Boost.CallableTraits list(APPEND ttg-defs "$") list(APPEND ttg-incs "$") endif () @@ -167,6 +171,11 @@ if (TARGET MADworld) endif(TARGET MADworld) if (TARGET Boost::serialization) list(APPEND ttg-serialization-deps Boost::serialization) + list(APPEND ttg-serialization-boost-deps Boost::serialization) + if (TARGET Boost::iostreams) # using modularized Boost? + list(APPEND ttg-serialization-deps Boost::iostreams) + list(APPEND ttg-serialization-boost-deps Boost::iostreams) + endif() list(APPEND ttg-serialization-compile-definitions TTG_SERIALIZATION_SUPPORTS_BOOST=1) endif (TARGET Boost::serialization) if (TARGET cereal::cereal) @@ -192,7 +201,7 @@ if (TARGET Boost::serialization) add_ttg_library(ttg-serialization-boost "${ttg-serialization-sources}" PUBLIC_HEADER "${ttg-serialization-headers}" - LINK_LIBRARIES "Boost::serialization" + LINK_LIBRARIES "${ttg-serialization-boost-deps}" COMPILE_DEFINITIONS "TTG_SERIALIZATION_SUPPORTS_BOOST=1") endif(TARGET Boost::serialization) # make cereal-only serialization target From 1125f96568bd987f75bb37ab546c721a1e930f41 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 31 Jan 2024 17:14:03 -0500 Subject: [PATCH 2/8] fixup --- ttg/ttg/parsec/ptr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ttg/ttg/parsec/ptr.h b/ttg/ttg/parsec/ptr.h index a20c0c746..8184d2657 100644 --- a/ttg/ttg/parsec/ptr.h +++ 
b/ttg/ttg/parsec/ptr.h @@ -38,7 +38,7 @@ namespace ttg_parsec { void register_self() { /* insert ourselves from the list of ptr */ - std::lock_guard {m_ptr_map_mtx}; + std::lock_guard _{m_ptr_map_mtx}; m_ptr_map.insert(std::pair{this, true}); } From 549ba94193e296f6248b1bf95ef24bf469fc0706 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Wed, 31 Jan 2024 17:14:42 -0500 Subject: [PATCH 3/8] introduce initialize_fpe to turn on/off floating-point exceptions --- ttg/ttg/util/bug.cpp | 98 ++++++++++++++++++++++++++++++++++++++++++++ ttg/ttg/util/bug.h | 9 ++++ 2 files changed, 107 insertions(+) diff --git a/ttg/ttg/util/bug.cpp b/ttg/ttg/util/bug.cpp index 1e91e8fd2..27b743096 100644 --- a/ttg/ttg/util/bug.cpp +++ b/ttg/ttg/util/bug.cpp @@ -28,6 +28,7 @@ #include "bug.h" #include +#include #include #include #include @@ -46,6 +47,103 @@ using namespace std; using namespace ttg; +namespace ttg { + void initialize_fpe() { +#if defined(__APPLE__) && defined(__MACH__) + + // Public domain polyfill for feenableexcept on OS X + // http://www-personal.umich.edu/~williams/archive/computation/fe-handling-example.c + +#ifndef HAVE_FEENABLEEXCEPT + auto feenableexcept = [](int excepts) -> int { + static fenv_t fenv; + const auto new_excepts = excepts & FE_ALL_EXCEPT; + + if (fegetenv(&fenv)) { + return -1; + } +#if defined(__x86_64__) + // previous masks + const unsigned int old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // unmask + fenv.__control &= ~new_excepts; + fenv.__mxcsr &= ~(new_excepts << 7); +#elif defined(__arm64__) + if (new_excepts & FE_INVALID) fenv.__fpcr |= __fpcr_trap_invalid; + if (new_excepts & FE_DIVBYZERO) fenv.__fpcr |= __fpcr_trap_divbyzero; + if (new_excepts & FE_OVERFLOW) fenv.__fpcr |= __fpcr_trap_overflow; + if (new_excepts & FE_UNDERFLOW) fenv.__fpcr |= __fpcr_trap_underflow; + if (new_excepts & FE_INEXACT) fenv.__fpcr |= __fpcr_trap_inexact; +#else +#error "MacOS on unknown architecture" +#endif + return fesetenv(&fenv); + }; +#define 
HAVE_FEENABLEEXCEPT 1 +#endif // not defined HAVE_FEENABLEEXCEPT + +#ifndef HAVE_FEDISABLEEXCEPT + auto fedisableexcept = [](int excepts) -> int { + static fenv_t fenv; + const auto new_excepts = excepts & FE_ALL_EXCEPT; + // all previous masks + + if (fegetenv(&fenv)) { + return -1; + } +#if defined(__x86_64__) + const unsigned int old_excepts = fenv.__control & FE_ALL_EXCEPT; + + // mask + fenv.__control |= new_excepts; + fenv.__mxcsr |= new_excepts << 7; +#elif defined(__arm64__) + if (new_excepts & FE_INVALID) fenv.__fpcr &= ~__fpcr_trap_invalid; + if (new_excepts & FE_DIVBYZERO) fenv.__fpcr &= ~__fpcr_trap_divbyzero; + if (new_excepts & FE_OVERFLOW) fenv.__fpcr &= ~__fpcr_trap_overflow; + if (new_excepts & FE_UNDERFLOW) fenv.__fpcr &= ~__fpcr_trap_underflow; + if (new_excepts & FE_INEXACT) fenv.__fpcr &= ~__fpcr_trap_inexact; +#else +#error "MacOS on unknown architecture" +#endif + + return fesetenv(&fenv); + }; + +#define HAVE_FEDISABLEEXCEPT 1 +#endif // not defined HAVE_FEDISABLEEXCEPT +#endif // mac + +#ifdef HAVE_FEENABLEEXCEPT + // this uses a glibc extension to trap on individual exceptions + int enable_excepts = 0; +#ifdef FE_DIVBYZERO + enable_excepts |= FE_DIVBYZERO; +#endif +#ifdef FE_INVALID + enable_excepts |= FE_INVALID; +#endif +#ifdef FE_OVERFLOW + enable_excepts |= FE_OVERFLOW; +#endif + feenableexcept(enable_excepts); +#endif + +#ifdef HAVE_FEDISABLEEXCEPT + // this uses a glibc extension to not trap on individual exceptions + int disable_excepts = 0; +#ifdef FE_UNDERFLOW + disable_excepts |= FE_UNDERFLOW; +#endif +#ifdef FE_INEXACT + disable_excepts |= FE_INEXACT; +#endif + fedisableexcept(disable_excepts); +#endif + } +} + ////////////////////////////////////////////////////////////////////// // static variables diff --git a/ttg/ttg/util/bug.h b/ttg/ttg/util/bug.h index 27212a809..df2791908 100644 --- a/ttg/ttg/util/bug.h +++ b/ttg/ttg/util/bug.h @@ -250,6 +250,15 @@ namespace ttg { } // namespace detail + /// @brief Initializes the 
floating point exceptions. + /// + /// Enables (if available) FE_DIVBYZERO, FE_INVALID, and FE_OVERFLOW; + /// FE_UNDERFLOW and FE_INEXACT are disabled (if available). + /// @warning This should be called from the main thread *before* any threads + /// have been created (i.e. before madness::initialize()), + /// so that all threads inherit the same floating point environment. + void initialize_fpe(); + /** * The Debugger class describes what should be done when a catastrophic * error causes unexpected program termination. It can try things such as From f05530fae941072f1a69517ceda3baf69b2d12bf Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 1 Feb 2024 00:24:59 -0500 Subject: [PATCH 4/8] can use bundled Boost.CallableTraits only if don't have real Boost and TTG_FETCH_BOOST is not set --- CMakeLists.txt | 2 +- .../ExternalDependenciesVersions.cmake | 2 +- cmake/modules/FindOrFetchBoost.cmake | 29 ++++++++++++++----- cmake/ttg-config.cmake.in | 22 ++++---------- ttg/CMakeLists.txt | 6 ++-- 5 files changed, 32 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1607cfcb9..1502787ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -58,7 +58,7 @@ option(TTG_EXAMPLES "Whether to build examples" OFF) option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF) option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF) -option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external depenedencies (Boost.CallableTraits)" OFF) +option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external dependencies (Boost.CallableTraits)" OFF) option(TTG_ENABLE_TRACE "Whether to enable ttg::trace() output" OFF) # See https://medium.com/@alasher/colored-c-compiler-output-with-ninja-clang-gcc-10bfe7f2b949 option (FORCE_COLORED_OUTPUT "Always produce ANSI-colored output (GNU/Clang only)." 
TRUE) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index b3d7c8e55..d4bc3bea5 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -3,7 +3,7 @@ # need Boost.CallableTraits (header only, part of Boost 1.66 released in Dec 2017) for wrap.h to work set(TTG_TRACKED_BOOST_VERSION 1.66) -set(TTG_TRACKED_VG_CMAKE_KIT_TAG d5c0a6f9ff6dc97cbb5132912733e1eb1cf73f1e) # used to provide "real" FindOrFetchBoost +set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_CEREAL_VERSION 1.3.0) set(TTG_TRACKED_MADNESS_TAG 8788aea9758bfe6479cc23d39e6c77b7528009db) diff --git a/cmake/modules/FindOrFetchBoost.cmake b/cmake/modules/FindOrFetchBoost.cmake index ed1fc9b2a..dc1a0a654 100644 --- a/cmake/modules/FindOrFetchBoost.cmake +++ b/cmake/modules/FindOrFetchBoost.cmake @@ -1,15 +1,31 @@ # Boost can be discovered by every (sub)package but only the top package can *build* it ... 
-# in either case must declare the components used by TA +# in either case must declare the components used by TTG set(required_components headers callable_traits ) +set(optional_components +) if (TTG_PARSEC_USE_BOOST_SERIALIZATION) - list(APPEND required_components + list(APPEND optional_components serialization iostreams ) endif() + +# if not allowed to fetch Boost make all Boost optional +if (NOT DEFINED Boost_FETCH_IF_MISSING AND TTG_FETCH_BOOST) + set(Boost_FETCH_IF_MISSING 1) +endif() +if (NOT Boost_FETCH_IF_MISSING) + foreach(__component IN LISTS required_components) + list(APPEND optional_components + ${__component} + ) + endforeach() + set(required_components ) +endif() + if (DEFINED Boost_REQUIRED_COMPONENTS) list(APPEND Boost_REQUIRED_COMPONENTS ${required_components}) @@ -17,8 +33,6 @@ if (DEFINED Boost_REQUIRED_COMPONENTS) else() set(Boost_REQUIRED_COMPONENTS "${required_components}" CACHE STRING "Components of Boost to discovered or built") endif() -set(optional_components -) if (DEFINED Boost_OPTIONAL_COMPONENTS) list(APPEND Boost_OPTIONAL_COMPONENTS ${optional_components} @@ -28,10 +42,6 @@ else() set(Boost_OPTIONAL_COMPONENTS "${optional_components}" CACHE STRING "Optional components of Boost to discovered or built") endif() -if (NOT DEFINED Boost_FETCH_IF_MISSING AND TTG_FETCH_BOOST) - set(Boost_FETCH_IF_MISSING 1) -endif() - # Bring ValeevGroup cmake toolkit, if not yet available if (NOT DEFINED vg_cmake_kit_SOURCE_DIR) include(FetchContent) @@ -53,3 +63,6 @@ if (NOT DEFINED vg_cmake_kit_SOURCE_DIR) endif() include(${vg_cmake_kit_SOURCE_DIR}/modules/FindOrFetchBoost.cmake) +if (TARGET Boost::headers) + set(TTG_HAS_BOOST 1) +endif() diff --git a/cmake/ttg-config.cmake.in b/cmake/ttg-config.cmake.in index 41663c806..82f7c2ae6 100644 --- a/cmake/ttg-config.cmake.in +++ b/cmake/ttg-config.cmake.in @@ -6,11 +6,10 @@ set(TTG_EXT_VERSION "@TTG_EXT_VERSION@") set(PaRSEC_CONFIG "@PaRSEC_CONFIG@") set(MADNESS_CONFIG "@MADNESS_CONFIG@") -set(Boost_CONFIG 
"@Boost_CONFIG@") set(CXX_COROUTINE_COMPONENT "@CXX_COROUTINE_COMPONENT@") set(TTG_TRACKED_BOOST_VERSION "@TTG_TRACKED_BOOST_VERSION@") - +set(TTG_HAS_BOOST @TTG_HAS_BOOST@) set(TTG_IGNORE_BUNDLED_EXTERNALS @TTG_IGNORE_BUNDLED_EXTERNALS@) # make TTG CMake modules discoverable + load AddTTGExecutable by default @@ -19,6 +18,10 @@ include(AddTTGExecutable) @PACKAGE_INIT@ +if (TTG_HAS_BOOST) +@Boost_CONFIG_FILE_CONTENTS@ +endif(TTG_HAS_BOOST) + if (NOT TARGET MADworld AND MADNESS_CONFIG) get_filename_component(MADNESS_CONFIG_DIR "${MADNESS_CONFIG}" DIRECTORY) find_package(MADNESS 0.10.1 CONFIG QUIET REQUIRED COMPONENTS world PATHS "${MADNESS_CONFIG_DIR}" NO_DEFAULT_PATH) @@ -29,12 +32,6 @@ if (NOT TARGET PaRSEC::parsec AND PaRSEC_CONFIG) find_package(PaRSEC CONFIG QUIET REQUIRED COMPONENTS parsec PATHS "${PaRSEC_CONFIG_DIR}" NO_DEFAULT_PATH) endif() -# if Boost was discovered and used at TTG configure time discover Boost at the same path -if (NOT TARGET Boost::boost AND Boost_CONFIG) - get_filename_component(Boost_CONFIG_DIR "${Boost_CONFIG}" DIRECTORY) - find_package(Boost ${TTG_TRACKED_BOOST_VERSION} CONFIG QUIET REQUIRED OPTIONAL_COMPONENTS serialization PATHS "${Boost_CONFIG_DIR}" NO_DEFAULT_PATH) -endif() - # if C++ coroutines were used discover same version of them if (NOT TARGET std::coroutine AND CXX_COROUTINE_COMPONENT) find_package(CXXStdCoroutine MODULE QUIET REQUIRED COMPONENTS "${CXX_COROUTINE_COMPONENT}") @@ -48,13 +45,4 @@ if(NOT TARGET ttg) endif() endif() -# if don't have Boost, use bundled Boost.CallableTraits -if (NOT TARGET Boost::boost) - if (TTG_IGNORE_BUNDLED_EXTERNALS) - find_package(Boost ${TTG_TRACKED_BOOST_VERSION} QUIET REQUIRED) - else() - target_compile_definitions(ttg INTERFACE TTG_USE_BUNDLED_BOOST_CALLABLE_TRAITS=1) - endif() -endif() - set(TTG_FOUND TRUE) diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index 5d40c5b20..307135ca0 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -101,8 +101,10 @@ if (TARGET 
Boost::headers) list(APPEND ttg-deps Boost::headers) endif() else () # if Boost::headers is missing must use bundled Boost.CallableTraits - list(APPEND ttg-defs "$") - list(APPEND ttg-incs "$") + list(APPEND ttg-defs "TTG_USE_BUNDLED_BOOST_CALLABLE_TRAITS=1") + list(APPEND ttg-incs + "$" + "$") endif () if (TARGET TTG_Libunwind) list(APPEND ttg-deps TTG_Libunwind) From 9765f756657a38870334343603ca66ce4f46f5c7 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 1 Feb 2024 03:21:55 -0500 Subject: [PATCH 5/8] bump MADNESS tag to fix up CI issues --- cmake/modules/ExternalDependenciesVersions.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/modules/ExternalDependenciesVersions.cmake b/cmake/modules/ExternalDependenciesVersions.cmake index d4bc3bea5..1e55b870e 100644 --- a/cmake/modules/ExternalDependenciesVersions.cmake +++ b/cmake/modules/ExternalDependenciesVersions.cmake @@ -6,7 +6,7 @@ set(TTG_TRACKED_BOOST_VERSION 1.66) set(TTG_TRACKED_VG_CMAKE_KIT_TAG 7ea2d4d3f8854b9e417f297fd74d6fc49aa13fd5) # used to provide "real" FindOrFetchBoost set(TTG_TRACKED_CATCH2_VERSION 3.5.0) set(TTG_TRACKED_CEREAL_VERSION 1.3.0) -set(TTG_TRACKED_MADNESS_TAG 8788aea9758bfe6479cc23d39e6c77b7528009db) +set(TTG_TRACKED_MADNESS_TAG 2eb3bcf0138127ee2dbc651f1aabd3e9b0def4e3) set(TTG_TRACKED_PARSEC_TAG 25d1931e863b6741e453112d2117d85ad32e7fba) set(TTG_TRACKED_BTAS_TAG 4e8f5233aa7881dccdfcc37ce07128833926d3c2) set(TTG_TRACKED_TILEDARRAY_TAG 493c109379a1b64ddd5ef59f7e33b95633b68d73) From b70444ca12ec180e107c44e44011a5743e7c5454 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Thu, 1 Feb 2024 03:25:45 -0500 Subject: [PATCH 6/8] [ci] install libboost-random-dev to help TA build itself --- .github/workflows/cmake.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index 8a22b24dd..68933c61c 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -53,7 +53,7 @@ 
jobs: wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /etc/apt/trusted.gpg.d/kitware.gpg >/dev/null sudo apt-add-repository "deb https://apt.kitware.com/ubuntu/ $(lsb_release -cs) main" sudo apt-get update - sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake doxygen + sudo apt-get -y install ninja-build g++-12 liblapack-dev libboost-dev libboost-serialization-dev libboost-random-dev libeigen3-dev openmpi-bin libopenmpi-dev libtbb-dev ccache flex bison cmake doxygen - name: Create Build Environment # Some projects don't allow in-source building, so create a separate build directory From 1ffaf6aa00b96a39bd74456f1effda222ae1ed20 Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 2 Feb 2024 09:36:24 -0500 Subject: [PATCH 7/8] implemented Read{A,B} that read A and B more intelligently --- examples/spmm/spmm.cc | 147 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 140 insertions(+), 7 deletions(-) diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index 94edf672b..f3ae517b6 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -163,7 +163,13 @@ inline int ijk2rank(int i, int j, int k, int P, int Q, int R) { return rank; } -// flow data from an existing SpMatrix on rank 0 +/// Pushes out data from an existing SpMatrix whose data is distributed on a 2-d grid. + +/// Data is pushed in the order of the appearance of the data in the container, without any tailoring to +/// the order in which the data is consumed; thus this is likely to generate tasks in a suboptimal order. +/// \note Reading should in general occur in the same order as the data will be consumed. 
+/// If all consuming tasks can execute concurrently this should be OK, albeit the runtime will likely throttle +/// sends, thus task dependencies further "down" the DAG may result in some reading orders being better than others template &)>> class Read_SpMatrix : public TT, std::tuple, Blk>>, Read_SpMatrix, ttg::typelist> { public: @@ -174,18 +180,142 @@ class Read_SpMatrix : public TT, std::tuple, Blk>>, Read_SpMat ij_keymap) , matrix_(matrix) {} - void op(const Key<2> &, std::tuple, Blk>> &out) { + // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ... + // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ... + // the consumers better use same keymap (ij_keymap) as this TT so that the data flow from this TT is local + void op(const Key<2> & /* pq */, std::tuple, Blk>> &out) { + auto rank = ttg::default_execution_context().rank(); + // this code assumes col-major layout + static_assert(SpMatrix::IsRowMajor == false, "SpMatrix must be col-major"); + for (int j = 0; j < matrix_.outerSize(); ++j) { + for (typename SpMatrix::InnerIterator it(matrix_, j); it; ++it) { + assert(j == it.col()); + const auto i = it.row(); + // IF the receiver uses the same keymap, these sends are local + if (rank == this->get_keymap()(Key<2>(std::initializer_list({i, j})))) { + ::send<0>(Key<2>(std::initializer_list({i, j})), it.value(), out); + } + } + } + } + + private: + const SpMatrix &matrix_; +}; + +enum class ReadSchedule { + SingleK, // appropriate for 2D (see ReadA) + MultipleK // appropriate for 3D (see ReadA) +}; +// change this to control the schedule of sends (2-D vs 3-D) +constexpr auto DefaultReadSchedule = ReadSchedule::SingleK; + +/// flow data from A distributed on a 2-d grid of processes in the order they are likely to be consumed + +/// The order of sends needs to be tailored as follows: +/// - for 2-D SUMMA (R=1): read A[i][k] for all i and k=0 first, then k=1, etc. 
Clearly, Read_SpMatrix is going to +/// generate reads in the wrong (transposed) order (send all k for i=0, then for i=1, etc.). +/// - for 2.5/3-D SUMMA (R>1): same order of sends as for 2-D SUMMA will be suboptimal since all k=0 will only +/// generate work on the r=0 process plane. Instead we *may* want to send one tile needed on each plane, then one more for +/// each plane. Hence the need for ReadSchedule. +template &)>> +class ReadA : public TT, std::tuple, Blk>>, ReadA, ttg::typelist> { + public: + using baseT = typename ReadA::ttT; + ReadA(const SpMatrix &matrix, Edge> &ctl, Edge, Blk> &out, const Keymap2 &ij_keymap, long R) + : baseT(edges(ctl), edges(out), std::string("SpMM25D::read_a"), {"ctl"}, {"a_ik"}, ij_keymap) + , matrix_(matrix) + , R_(R) {} + + // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ... + // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ... + // the consumers better use same keymap (ij_keymap) as this TT so that the data flow from this TT is local + void op(const Key<2> & /* pq */, std::tuple, Blk>> &out) { + auto rank = ttg::default_execution_context().rank(); + const int I = matrix_.rows(); + const int K = matrix_.cols(); + + // this assumes col-major layout of SpMatrix + static_assert(SpMatrix::IsRowMajor == false, "SpMatrix must be col-major"); + + // MultipleK schedule is not yet correctly implemented + static_assert(KSchedule == ReadSchedule::SingleK, "MultipleK schedule not yet implemented"); + + // loop over blocks of k at a time, block size controlled by KSchedule + const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 1 : R_; + for (std::pair k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K; + k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) { + + // N.B. 
: due to the CSC layout of A iterating over (blocks of) columns is efficient + for (int k = k_blk.first; k < k_blk.second; ++k) { + for (typename SpMatrix::InnerIterator it(matrix_, k); it; ++it) { + assert(k == it.col()); + const auto i = it.row(); + // IF the receiver uses the same keymap, these sends are local + if (rank == this->get_keymap()(Key<2>(std::initializer_list({i, k})))) { + ::send<0>(Key<2>(std::initializer_list({i, k})), it.value(), out); + } + } + } + + } + + } + + private: + const SpMatrix &matrix_; + long R_; +}; + +// flow data from B distributed on a 2-d grid of processes, one block of k at a time +template &)>> +class ReadB : public TT, std::tuple, Blk>>, ReadB, ttg::typelist> { + public: + using baseT = typename ReadB::ttT; + ReadB(const SpMatrix &matrix, Edge> &ctl, Edge, Blk> &out, const Keymap2 &ij_keymap, long R) + : baseT(edges(ctl), edges(out), std::string("read_b"), {"ctl"}, {"b_kj"}, ij_keymap), matrix_(matrix), R_(R) {} + + // key is this process' coordinate in the 2-d grid of processes (managed by ij_keymap) ... + // but it's not used at all since all this TT does is generate consuming tasks that use local tiles ... 
+ // the consumers better use same keymap (ij_keymap) as this TT so that the data flow from this TT is local + void op(const Key<2> & /* pq */, std::tuple, Blk>> &out) { auto rank = ttg::default_execution_context().rank(); - for (int k = 0; k < matrix_.outerSize(); ++k) { - for (typename SpMatrix::InnerIterator it(matrix_, k); it; ++it) { - if (rank == this->get_keymap()(Key<2>(std::initializer_list({it.row(), it.col()})))) - ::send<0>(Key<2>(std::initializer_list({it.row(), it.col()})), it.value(), out); + const int J = matrix_.cols(); + const int K = matrix_.rows(); + + // this assumes col-major layout of SpMatrix + static_assert(SpMatrix::IsRowMajor == false, "SpMatrix must be col-major"); + + // MultipleK schedule is not yet correctly implemented + static_assert(KSchedule == ReadSchedule::SingleK, "MultipleK schedule not yet implemented"); + + // loop over blocks of k at a time, block size controlled by KSchedule + const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 1 : R_; + for (std::pair k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K; + k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) { + + // WARNING : due to the CSC layout of B iterating over (blocks of) columns is inefficient + for (int j = 0; j < matrix_.outerSize(); ++j) { + for (typename SpMatrix::InnerIterator it(matrix_, j); it; ++it) { + assert(j == it.col()); + const auto k = it.row(); + // if k past the k block, we are done with this i + if (k >= k_blk.second) break; + // continue iterating until k has not reached this k block + if (k < k_blk.first) continue; + // IF the receiver uses the same keymap, these sends are local + if (rank == this->get_keymap()(Key<2>(std::initializer_list({k, j})))) { + ::send<0>(Key<2>(std::initializer_list({k, j})), it.value(), out); + } + } + } } } private: const SpMatrix &matrix_; + long R_; }; // flow (move?) 
data into an existing SpMatrix on rank 0 @@ -1153,7 +1283,7 @@ static void initBlSpRandom(const std::function &)> &keymap, siz size_t avg_nb = 0; int avg_nb_nb = 0; - struct tuple_hash : public std::unary_function, std::size_t> { + struct tuple_hash { std::size_t operator()(const std::tuple &k) const { return static_cast(std::get<0>(k)) | (static_cast(std::get<1>(k)) << 32); } @@ -1256,6 +1386,9 @@ static void timed_measurement(SpMatrix<> &A, SpMatrix<> &B, const std::function< Read_SpMatrix a("A", A, ctl, eA, ij_keymap); Read_SpMatrix b("B", B, ctl, eB, ij_keymap); + // uncomment this to use more intelligent schedule of reads +// ReadA<> a(A, ctl, eA, ij_keymap, R); +// ReadB<> b(B, ctl, eB, ij_keymap, R); Write_SpMatrix<> c(C, eC, ij_keymap); auto &c_status = c.status(); assert(!has_value(c_status)); From 3dad2c1d5fd394d17f40b506efe82d02542ec58f Mon Sep 17 00:00:00 2001 From: Eduard Valeyev Date: Fri, 2 Feb 2024 11:17:10 -0500 Subject: [PATCH 8/8] Read{A,B} can use MultipleK schedule --- examples/spmm/spmm.cc | 45 +++++++++++++++++++++++++++++-------------- 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/examples/spmm/spmm.cc b/examples/spmm/spmm.cc index f3ae517b6..ff3250862 100644 --- a/examples/spmm/spmm.cc +++ b/examples/spmm/spmm.cc @@ -238,22 +238,39 @@ class ReadA : public TT, std::tuple, Blk>>, ReadA::IsRowMajor == false, "SpMatrix must be col-major"); - // MultipleK schedule is not yet correctly implemented - static_assert(KSchedule == ReadSchedule::SingleK, "MultipleK schedule not yet implemented"); - // loop over blocks of k at a time, block size controlled by KSchedule const int k_blk_size = (KSchedule == ReadSchedule::SingleK) ? 
1 : R_; + + // this keeps iterators over each k-column in this block + // there is only 1 task per process, so no need to synchronize access to this + static std::vector::InnerIterator> column_iterators; + for (std::pair k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K; k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) { + // for each k in the block send one A[i][k], then next i, etc. this means keep track of iterators over k-column + // for each k in the block // N.B. : due to the CSC layout of A iterating over (blocks of) columns is efficient + column_iterators.resize(0); for (int k = k_blk.first; k < k_blk.second; ++k) { - for (typename SpMatrix::InnerIterator it(matrix_, k); it; ++it) { - assert(k == it.col()); - const auto i = it.row(); - // IF the receiver uses the same keymap, these sends are local - if (rank == this->get_keymap()(Key<2>(std::initializer_list({i, k})))) { - ::send<0>(Key<2>(std::initializer_list({i, k})), it.value(), out); + column_iterators.emplace_back(matrix_, k); + } + + int k_remaining = k_blk.second - k_blk.first; + while(k_remaining != 0) { + for (int k = k_blk.first, k_in_blk = 0; k < k_blk.second; ++k, ++k_in_blk) { + if (auto& it = column_iterators[k_in_blk]) { + assert(k == it.col()); + const auto i = it.row(); + // IF the receiver uses the same keymap, these sends are local + if (rank == this->get_keymap()(Key<2>(std::initializer_list({i, k})))) { + ::send<0>(Key<2>(std::initializer_list({i, k})), it.value(), out); + } + ++it; + } + else { // this k is done + assert(k_remaining != 0); + --k_remaining; } } } @@ -286,13 +303,10 @@ class ReadB : public TT, std::tuple, Blk>>, ReadB::IsRowMajor == false, "SpMatrix must be col-major"); - // MultipleK schedule is not yet correctly implemented - static_assert(KSchedule == ReadSchedule::SingleK, "MultipleK schedule not yet implemented"); - // loop over blocks of k at a time, block size controlled by KSchedule const int k_blk_size = (KSchedule == 
ReadSchedule::SingleK) ? 1 : R_; for (std::pair k_blk = {0, std::min(k_blk_size, K)}; k_blk.first < K; - k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) { + k_blk = {k_blk.first + k_blk_size, std::min(k_blk.first + k_blk_size + k_blk_size, K)}) { // WARNING : due to the CSC layout of B iterating over (blocks of) columns is inefficient for (int j = 0; j < matrix_.outerSize(); ++j) { @@ -580,7 +594,7 @@ class SpMM25D { const Keymap3 &ijk_keymap_; }; // class BcastB - /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this kayer of the + /// multiply task has 3 input flows: a_ijk, b_ijk, and c_ijk, c_ijk contains the running total for this layer of the /// 3-D process grid only class MultiplyAdd : public TT, std::tuple, Blk>, Out, Blk>>, MultiplyAdd, ttg::typelist> { @@ -1691,6 +1705,9 @@ int main(int argc, char **argv) { Edge, blk_t> eA, eB, eC; Read_SpMatrix a("A", A, ctl, eA, ij_keymap); Read_SpMatrix b("B", B, ctl, eB, ij_keymap); + // uncomment this to use more intelligent schedule of reads +// ReadA<> a(A, ctl, eA, ij_keymap, R); +// ReadB<> b(B, ctl, eB, ij_keymap, R); Write_SpMatrix<> c(C, eC, keymap_write); auto &c_status = c.status(); assert(!has_value(c_status));