# Patch summary (root CMakeLists.txt portion):
#  - defines PORTBLAS_INSTALL_SRC when INSTALL_HEADER_ONLY is set, and uses it for the
#    INTERFACE_INCLUDE_DIRECTORIES of the header-only `portblas` INTERFACE target and
#    for the header-only install(DIRECTORY ...) rule;
#  - adds SB_ENABLE_USM (when the compiler accepts -fsycl), BLAS_ENABLE_COMPLEX (when enabled)
#    and ${TUNING_TARGET} as INTERFACE compile definitions, plus two -Wno-* INTERFACE options;
#  - when the compiler ID is IntelLLVM and the tuning target is not INTEL_GPU, adds
#    -fno-fast-math and -mllvm=-loopopt=0 as INTERFACE options.
# if() operands are quoted so an empty TUNING_TARGET cannot produce a malformed condition.
# NOTE(review): "$ $" and "$" below look like generator expressions whose <...> text was
# lost in extraction - confirm against the original patch before applying.
diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c3462e3f..537bd8d2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,9 @@ set(PORTBLAS_GENERATED_SRC ${CMAKE_CURRENT_BINARY_DIR}/generated_src) set(PORTBLAS_INCLUDE $ $) set(PORTBLAS_COMMON_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/common/include) +if(INSTALL_HEADER_ONLY) + set(PORTBLAS_INSTALL_SRC $ $) +endif() set(PORTBLAS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/src) set(PORTBLAS_SRC_GENERATOR ${CMAKE_CURRENT_SOURCE_DIR}/python_generator) list(APPEND THIRD_PARTIES_INCLUDE ${CBLAS_INCLUDE}) @@ -142,8 +145,24 @@ include(CmakeFunctionHelper) if (INSTALL_HEADER_ONLY) add_library(portblas INTERFACE) set_target_properties(portblas PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${PORTBLAS_INCLUDE};$" - ) + INTERFACE_INCLUDE_DIRECTORIES "${PORTBLAS_INCLUDE};${PORTBLAS_INSTALL_SRC}") + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("-fsycl" is_dpcpp) + if(is_dpcpp) + target_compile_definitions(portblas INTERFACE "SB_ENABLE_USM") + endif() + if(BLAS_ENABLE_COMPLEX) + target_compile_definitions(portblas INTERFACE "BLAS_ENABLE_COMPLEX") + endif() + target_compile_definitions(portblas INTERFACE ${TUNING_TARGET}) + target_compile_options(portblas INTERFACE -Wno-deprecated-declarations) + target_compile_options(portblas INTERFACE -Wno-deprecated-copy-with-user-provided-copy) + if(("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") AND NOT + ("${TUNING_TARGET}" STREQUAL "INTEL_GPU") ) + target_compile_options(portblas INTERFACE -fno-fast-math) + target_compile_options(portblas INTERFACE -mllvm=-loopopt=0) + message(STATUS "Adding -fno-fast-math -mllvm=-loopopt=0 to portblas") + endif() else() add_subdirectory(src) build_library(portblas ${BLAS_ENABLE_EXTENSIONS}) @@ -202,7 +221,7 @@ install(DIRECTORY ${PORTBLAS_INCLUDE} FILES_MATCHING PATTERN "*.h" ) if (INSTALL_HEADER_ONLY) - install(DIRECTORY ${PORTBLAS_SRC} + install(DIRECTORY ${PORTBLAS_INSTALL_SRC} DESTINATION ${CMAKE_INSTALL_PREFIX} COMPONENT portblas FILES_MATCHING 
PATTERN "*.hpp" diff --git a/README.md b/README.md index 3752e5bf7..cd25f3ab9 100644 --- a/README.md +++ b/README.md @@ -401,8 +401,7 @@ to clone submodule(s). ### Compile with DPC++ ```bash -export CC=[path/to/intel/clang] -export CXX=[path/to/intel/clang++] +export CXX=[path/to/intel/icpx] cd build cmake -GNinja ../ -DSYCL_COMPILER=dpcpp ninja @@ -417,6 +416,13 @@ advisable for NVIDIA and **mandatory for AMD** to provide the specific device architecture through `-DDPCPP_SYCL_ARCH=`, e.g., `` can be `sm_80` for NVIDIA or `gfx908` for AMD. +#### DPC++ Compiler Support + +The suggested DPC++ SYCL compiler is `icpx`, provided by the +Intel [oneAPI base-toolkit](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html#gs.7t6x52), +with which the project is fully compatible. PortBLAS can also be compiled with the [open source intel/llvm](https://github.com/intel/llvm) +compiler, but not all of the latest changes are tested with it. + ### Compile with AdaptiveCpp *(Formerly hipSYCL)* The following instructions concern the **generic** *(clang-based)* flow supported by AdaptiveCpp. diff --git a/benchmark/portblas/CMakeLists.txt b/benchmark/portblas/CMakeLists.txt index 8bf2c3164..6245bbda8 100644 --- a/benchmark/portblas/CMakeLists.txt +++ b/benchmark/portblas/CMakeLists.txt @@ -134,6 +134,12 @@ foreach(portblas_bench ${sources}) target_link_libraries(bench_${bench_exec} PRIVATE blas::blas) endif() + # CMake reports the compiler ID IntelLLVM only for compilers distributed with Intel oneAPI releases, + # so this flag is not applied when building with the open source intel/llvm compiler. 
+ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") + target_compile_options(bench_${bench_exec} PRIVATE "-fno-fast-math") + endif() + message(STATUS "Created benchmark: ${bench_exec}") install(TARGETS bench_${bench_exec} RUNTIME diff --git a/cmake/Modules/FindDPCPP.cmake b/cmake/Modules/FindDPCPP.cmake index f8eec161b..95549a3fd 100644 --- a/cmake/Modules/FindDPCPP.cmake +++ b/cmake/Modules/FindDPCPP.cmake @@ -96,6 +96,14 @@ function(add_sycl_to_target) "${multi_value_args}" ${ARGN} ) + # CMake reports the compiler ID IntelLLVM only for compilers distributed with Intel oneAPI releases, + # so these flags are not applied with the open source intel/llvm compiler; operands are quoted so an empty value stays a valid condition. + if(("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") AND NOT + ("${TUNING_TARGET}" STREQUAL "INTEL_GPU") ) + target_compile_options(${SB_ADD_SYCL_TARGET} PRIVATE -fno-fast-math) + target_compile_options(${SB_ADD_SYCL_TARGET} PRIVATE -mllvm=-loopopt=0) + message(STATUS "Adding -fno-fast-math -mllvm=-loopopt=0 to target ${SB_ADD_SYCL_TARGET}") + endif() target_compile_options(${SB_ADD_SYCL_TARGET} PUBLIC ${DPCPP_FLAGS}) get_target_property(target_type ${SB_ADD_SYCL_TARGET} TYPE) if (NOT target_type STREQUAL "OBJECT_LIBRARY") diff --git a/src/operations/blas1/IndexMaxMin.hpp b/src/operations/blas1/IndexMaxMin.hpp index d40367d6c..97b56f72b 100644 --- a/src/operations/blas1/IndexMaxMin.hpp +++ b/src/operations/blas1/IndexMaxMin.hpp @@ -98,8 +98,8 @@ PORTBLAS_INLINE void IndexMaxMin::eval( // reduction within the sub_group for (index_t i = sg_local_range >> 1; i > 0; i >>= 1) { if (sg_local_id < i) { - element_t shfl_val = sg.shuffle_down(val.get_value(), i); - index_t shfl_idx = sg.shuffle_down(val.get_index(), i); + element_t shfl_val = sycl::shift_group_left(sg, val.get_value(), i); + index_t shfl_idx = sycl::shift_group_left(sg, val.get_index(), i); value_t shfl{shfl_idx, shfl_val}; val = op::eval(val, shfl); } diff --git a/test/unittest/CMakeLists.txt b/test/unittest/CMakeLists.txt index f6044d830..a77490ff3 100644 --- 
a/test/unittest/CMakeLists.txt +++ b/test/unittest/CMakeLists.txt @@ -101,13 +101,6 @@ if(is_dpcpp) ) endif() - -# Contains tests that fail if compiled with -ffast-math -set(SYCL_UNITTEST_NOFASTMATH - ${PORTBLAS_UNITTEST}/blas1/blas1_rotg_test.cpp - ${PORTBLAS_UNITTEST}/blas1/blas1_rotmg_test.cpp -) - if(GEMM_TALL_SKINNY_SUPPORT) list(APPEND SYCL_UNITTEST_SRCS ${PORTBLAS_UNITTEST}/blas3/blas3_gemm_tall_skinny_test.cpp) endif() @@ -142,8 +135,9 @@ foreach(blas_test ${SYCL_UNITTEST_SRCS}) target_link_libraries(${test_exec} PRIVATE gtest_main Clara::Clara blas::blas portblas) target_include_directories(${test_exec} PRIVATE ${CBLAS_INCLUDE} ${PORTBLAS_COMMON_INCLUDE_DIR}) - list (FIND SYCL_UNITTEST_NOFASTMATH ${blas_test} _index) - if (${_index} GREATER -1) + # CMake reports the compiler ID IntelLLVM only for compilers distributed with Intel oneAPI releases, + # so -fno-fast-math is not added when building with the open source intel/llvm compiler. + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") target_compile_options(${test_exec} PRIVATE "-fno-fast-math") endif()